Python clear_space_str示例

编程语言: Python

命名空间/包名称: scrapySchool_England.clearSpace

方法/功能: clear_space_str

hotexamples.com的示例: 30

Python clear_space_str - 已找到30个示例。以下是从开源项目中提取的、评价最高的 scrapySchool_England.clearSpace.clear_space_str 真实 Python 用法示例。您可以对示例进行评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： UniversityOfCambridge_P.py 项目： histudent/python_spider

    def parse_apply_proces_en(self, how_to_apply_url):
        """Scrape the "how to apply" page for the application text.

        Downloads ``how_to_apply_url`` with ``self.headers`` and parses the
        body with lxml.

        Returns:
            A two-element list ``[apply, apply_documents]``.

            * ``apply`` is the HTML of the first course-apply field div after
              ``clear_space_str`` and ``remove_class``, or ``""`` when the div
              is not on the page.
            * ``apply_documents`` is the concatenated HTML of every node
              matched by the "Things" heading query, passed through the same
              two helpers (they run even when nothing matched).
        """
        page = requests.get(how_to_apply_url, headers=self.headers)
        tree = etree.HTML(page.text)

        def as_html(node):
            # Serialise an lxml element back into an HTML string.
            return etree.tostring(node,
                                  encoding='unicode',
                                  pretty_print=False,
                                  method='html')

        process_nodes = tree.xpath(
            "//div[@class='field field-name-field-gao-course-apply field-type-text-long field-label-hidden']"
        )
        process_html = ""
        if process_nodes:
            process_html = remove_class(
                clear_space_str(as_html(process_nodes[0])))

        document_nodes = tree.xpath(
            '//h2[contains(text(),"Things")]/preceding-sibling::*[1]/following-sibling::*'
        )
        documents_html = "".join(as_html(node) for node in document_nodes)
        documents_html = remove_class(clear_space_str(documents_html))
        return [process_html, documents_html]

示例#2

显示文件

文件： TheUniversityOfEdinburgh_R.py 项目： histudent/python_spider

 def get_modules2(self, modules2url):
     """Download a modules page and return its main container as HTML.

     Args:
         modules2url: URL of the page to fetch (requested with
             ``self.headers``).

     Returns:
         The HTML of the first ``/html/body/div[@class='container']``
         element after ``clear_space_str`` and ``remove_class``, or ``""``
         when the page has no such element.
     """
     data = requests.get(modules2url, headers=self.headers)
     response = etree.HTML(data.text)
     modules2 = response.xpath("/html/body/div[@class='container']")
     if not modules2:
         # Nothing matched (different layout, error page, ...): report
         # "no content" instead of failing with IndexError on modules2[0].
         return ""
     m2 = etree.tostring(modules2[0], encoding='unicode', pretty_print=False, method='html')
     return remove_class(clear_space_str(m2))

示例#3

显示文件

文件： UniversityOfSurrey_R.py 项目： histudent/python_spider

 def parse_apply_proces_en(self, how_to_apply_url):
     """Download the "how to apply" page and return its intro block as HTML.

     Args:
         how_to_apply_url: URL of the page to fetch (requested with
             ``self.headers``).

     Returns:
         The HTML of the first ``div.layout-row.intro.summary`` element
         (matched on the exact class string) after ``clear_space_str`` and
         ``remove_class``, or ``""`` when the page has no such element.
     """
     data = requests.get(how_to_apply_url, headers=self.headers)
     response = etree.HTML(data.text)
     apply_proces_en = response.xpath(
         "//div[@class='layout-row intro summary']")
     if not apply_proces_en:
         # Nothing matched: return an empty value rather than raising
         # IndexError on apply_proces_en[0].
         return ""
     # Convert the matched Element back to an HTML string.
     apply = etree.tostring(apply_proces_en[0],
                            encoding='unicode',
                            pretty_print=False,
                            method='html')
     return remove_class(clear_space_str(apply))

示例#4

显示文件

文件： UniversityOfCambridge_P.py 项目： histudent/python_spider

 def parse_assessment_en(self, teaching_assessment_url):
     """Download the teaching/assessment page and return its study field.

     Args:
         teaching_assessment_url: URL of the page to fetch (requested with
             ``self.headers``).

     Returns:
         The HTML of the first course-study field div after
         ``clear_space_str`` and ``remove_class``, or ``""`` when the page
         has no such div.
     """
     data = requests.get(teaching_assessment_url, headers=self.headers)
     response = etree.HTML(data.text)
     assessment_en = response.xpath(
         "//div[@class='field field-name-field-gao-course-study field-type-text-long field-label-hidden']"
     )
     if not assessment_en:
         # Nothing matched: return an empty value rather than raising
         # IndexError on assessment_en[0].
         return ""
     ass = etree.tostring(assessment_en[0],
                          encoding='unicode',
                          pretty_print=False,
                          method='html')
     return remove_class(clear_space_str(ass))

示例#5

显示文件

文件： KinstonUniversity_u.py 项目： histudent/python_spider

    def parse(self, response):
        """Scrape one Kingston University undergraduate course page.

        Reads the programme title, overview, module list and UCAS table from
        ``response``, downloads the course sub-pages (teaching/assessment,
        entry requirements, fees, careers) with ``requests``, and fills an
        item created by ``get_item1(ScrapyschoolEnglandItem1)``.

        Yields:
            One item per UCAS code when every code found on the page can be
            paired with a "full time" duration cell; otherwise a single item
            with ``duration=None`` and empty ``other`` / ``ucascode``.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

        def fetch_tree(page_url):
            # Download a sub-page synchronously and parse it with lxml.
            return etree.HTML(requests.get(page_url, headers=headers).text)

        # 1. university
        university = 'Kingston University'

        # 2. url
        # NOTE(review): sub-page URLs are built as url + '<name>.html', which
        # presumably relies on the course URL ending in "/" - confirm.
        url = response.url

        # 3. programme_en / 4. degree_name
        programme_en = response.xpath('//*[@id="middle-col"]/h1').extract()
        programme_en = remove_tags(''.join(programme_en))
        programme_en = programme_en.replace('&amp; ', '')
        # A title without a "<award>(Hons)" token simply has no degree name;
        # re.search avoids the IndexError findall(...)[0] raised for titles
        # that contain "(" for some other reason.
        hons = re.search(r'[A-Za-z/]+\(Hons\)', programme_en)
        degree_name_a = hons.group() if hons else ''
        degree_name = degree_name_a.replace('(Hons)', '')
        if degree_name_a:
            programme_en = programme_en.replace(degree_name_a, '')
        programme_en = programme_en.replace('  ', ' ')

        # 5. degree_type
        degree_type = 1

        # 6. start_date
        start_date = '2018-9,2019-1,2019-4'

        # 7. overview_en
        overview_en = response.xpath(
            "//h2[contains(text(),'What you will study')]/preceding-sibling::*").extract()
        overview_en = remove_class(clear_space_str(''.join(overview_en)))

        # 8. assessment_en
        assessment_tree = fetch_tree(url + 'teaching-learning-assessment.html')
        assessment_en = ''.join(
            assessment_tree.xpath('//*[@id="middle-col"]/div[2]/p//text()'))

        # 9. modules_en
        modules_en = response.xpath('//*[@id="modulelist"]').extract()
        modules_en = remove_class(''.join(modules_en))

        # 10. alevel
        alevel_tree = fetch_tree(url + 'entry-requirements.html')
        alevel = ''.join(
            alevel_tree.xpath("//*[contains(text(),'evel')]/.//text()"))

        # 11-15. ielts: (overall, reading, writing, listening, speaking).
        # The first keyword found in the programme title wins.
        ielts_by_keyword = (
            ('Health', (6.5, 5.5, 5.5, 5.5, 5.5)),
            ('Social Care', (6.5, 5.5, 5.5, 5.5, 5.5)),
            ('Education', (6.5, 5.5, 5.5, 5.5, 5.5)),
            ('Journalism', (6.5, 5.5, 6.5, 5.5, 5.5)),
            ('Nursing', (7.0, 7.0, 7.0, 7.0, 7.0)),
            ('Nutrition', (6.5, 6.0, 6.0, 6.0, 6.0)),
        )
        # NOTE(review): the default writing score (6.5) is higher than the
        # default overall score (6.0); kept as found - confirm it is intended.
        ielts_default = (6.0, 5.5, 6.5, 5.5, 5.5)
        ielts, ielts_r, ielts_w, ielts_l, ielts_s = next(
            (scores for keyword, scores in ielts_by_keyword
             if keyword in programme_en),
            ielts_default,
        )

        # 16. tuition_fee
        fee_tree = fetch_tree(url + 'fees-and-funding.html')
        tuition_fee = ''.join(fee_tree.xpath(
            '//*[@id="middle-col"]/div[2]/table/tbody/tr[3]/td[2]//text()'))
        tuition_fee = getTuition_fee(tuition_fee)

        # 17. tuition_fee_pre
        tuition_fee_pre = '£'

        # 18. apply_proces_en (stored as a link, not as page content)
        apply_proces_en = response.url + 'apply-now.html'

        # 19. career_en
        career_tree = fetch_tree(url + 'after-you-graduate.html')
        career_nodes = career_tree.xpath(
            "//h2[contains(text(),'Careers and progression')]/../..")
        career_en = ""
        for node in career_nodes:
            career_en += etree.tostring(
                node, encoding='unicode', pretty_print=False, method='html')
            # remove_class is re-applied to the accumulated markup on every
            # pass, exactly as the previous implementation did.
            career_en = remove_class(career_en)

        # 20. location
        location = 'London'

        # 21. apply_pre
        apply_pre = '£'

        item['alevel'] = alevel
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_name'] = degree_name
        item['degree_type'] = degree_type
        item['start_date'] = start_date
        item['overview_en'] = overview_en
        item['assessment_en'] = assessment_en
        item['modules_en'] = modules_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['apply_proces_en'] = apply_proces_en
        item['career_en'] = career_en
        item['location'] = location

        # 22. ucascode / duration
        cells_html = ''.join(
            response.xpath('//*[@id="middle-col"]//table//tr/td').extract())
        # A code is taken to be an upper-case letter followed by three
        # upper-case letters/digits at the very start of a table cell.
        ucas = re.findall(r'td>([A-Z][A-Z0-9]{3})', cells_html)

        item['duration'] = None
        item['other'] = ''
        item['ucascode'] = ''

        # Pair each code with the "full time" cell that precedes it. The code
        # must be quoted inside the XPath: unquoted, XPath reads it as a child
        # element name, which makes contains() true for every node.
        pairs = []
        for code in ucas:
            query = ('//*[contains(text(),"' + code + '")]'
                     '//preceding-sibling::td[contains(text(),"full time")]')
            duration_cells = response.xpath(query).extract()
            if duration_cells:
                pairs.append((code, duration_cells[0]))

        if ucas and len(pairs) == len(ucas):
            # NOTE(review): the same item object is updated and yielded once
            # per code; if downstream processing keeps a reference instead of
            # consuming it immediately, yield a copy here - confirm.
            for code, cell in pairs:
                duration_major = cell.replace('<td>', '').replace('</td>', '')
                first_digit = re.search(r'\d', duration_major)
                item['duration'] = first_digit.group() if first_digit else None
                item['other'] = duration_major
                item['ucascode'] = code
                yield item
        else:
            # No codes, or not every code could be matched to a duration:
            # emit the course once without per-code details.
            yield item

示例#6

显示文件

文件： LondonMetropolitanUniversity_u.py 项目： histudent/python_spider

    def parse(self, response):
        """Scrape one London Metropolitan University undergraduate course page.

        Extracts the course fields from ``response``, combines them with a
        few fixed texts, and stores everything on an item created by
        ``get_item1(ScrapyschoolEnglandItem1)``.

        ``degree_name``, ``location`` and the IELTS fields are not populated
        by this callback (their extraction is disabled).

        Yields:
            The populated item, exactly once.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def html_of(query):
            # Concatenate the extracted form of everything the query matches.
            return ''.join(response.xpath(query).extract())

        # 1. university
        university = 'London Metropolitan University'

        # 2. url
        url = response.url

        # 3. ucascode
        ucascode = html_of(
            "//*[contains(text(),'UCAS code:')]//following-sibling::*")
        ucascode = clear_space_str(remove_tags(ucascode))

        # 4. programme_en
        programme_en = remove_tags(
            html_of('//*[@id="MainContent"]/div[1]/h1'))

        # 5. degree_type
        degree_type = 1

        # 7. alevel
        alevel = html_of('//*[@id="entry-requirements"]/div/ul/li[1]')
        alevel = clear_space_str(remove_tags(alevel))

        # 8. overview_en
        overview_en = html_of('//*[@id="LeftColumn"]/section/p')
        overview_en = clear_space_str(remove_class(overview_en))

        # 9. start_date
        start_date = '2018-8-18'

        # 10. apply_pre
        apply_pre = '£'

        # 11. duration: first data-duration value of the 2019 intake, falling
        # back to the 2018 intake, else ''. Previously a missing 2019 intake
        # raised IndexError before the fallback could run, and the fallback
        # itself lacked .extract(); a bare except hid both.
        duration = ''
        for intake in ('September 2019 - Full-time',
                       'September 2018 - Full-time'):
            values = response.xpath(
                "//*[contains(text(),'" + intake + "')]//@data-duration"
            ).extract()
            if values and values[0]:
                duration = values[0]
                break

        # 12. tuition_fee (same 2019 -> 2018 fallback)
        tuition_fee = html_of(
            "//*[contains(text(),'September 2019 - Full-time')]//@data-cost")
        if not tuition_fee:
            tuition_fee = html_of(
                "//*[contains(text(),'September 2018 - Full-time')]//@data-cost")
        tuition_fee = getTuition_fee(tuition_fee)

        # 14. apply_documents_en (fixed text)
        apply_documents_en = '<p>We welcome applications from all suitably qualified prospective students and want to recruit students with the very best academic merit, potential and motivation, irrespective of their background. We carefully consider each application on an individual basis, taking into account all the information presented on your application form, including your: academic achievement (including predicted and achieved grades) personal statement two references CV</p>'

        # 15. modules_en
        modules_en = remove_class(
            html_of('//*[@id="modular-structure"]')).replace('▼', '')

        # 16. assessment_en
        assessment_en = remove_class(html_of(
            "//h3[contains(text(),'Assessment')]//following-sibling::p[1]"))

        # 17. career_en
        career_en = remove_class(
            html_of('//*[@id="career-opportunities"]/div'))

        # 23. require_chinese_en (stored as a link)
        require_chinese_en = "https://www.londonmet.ac.uk/international/international-admissions/application-guidance-and-entry-criteria/academic-entry-requirements-by-country/non-eueea-countries/china/"

        # 24. apply_proces_en (fixed text)
        # NOTE(review): this literal arrived split over two physical lines,
        # which is not valid inside a single-quoted string. The two parts are
        # joined here with nothing added between them - confirm against the
        # upstream file whether a separator character belongs at the seam.
        apply_proces_en = (
            '<p>Stage 1: choosing your course The first step for you as a new applicant is to choose the course you wish to undertake. If you have any questions at this stage you can contact our international recruitment team who will be happy to assist you and provide information about our courses. You can begin a conversation about a course you are interested in by emailing our recruitment team at: [email protected]. We often have representatives of London Metropolitan University visiting countries all around the world. You can find out the latest planned trips to see if we will be visiting near you, here: Meet us overseas  Stage 2: applying for your course Once you have decided on your course you need to submit an application as soon as possible making sure you observe the international application deadlines. The method of application depends on the type of course you are applying for. The application methods available for each course are listed on the course page. You should check these details carefully to avoid any delay in your application reaching us.You should observe our international application guidance before submitting an application. Please see here: International application advice Stage 3: awaiting and responding to your offer Once the University receives your application you will receive a communication from us acknowledging this. You will also obtain your London Metropolitan University application ID and details about using, the applicant portal (Evision). At this point your application will enter the pending decision/consideration stage, and we will communicate with you again, either to request more information (such as a qualification transcript, portfolio, or piece of written work) for assessment, or to advise you of our decision.If you are successful in receiving an offer from us you will receive a communication detailing a conditional or unconditional offer, and this will contain further information and instruction. '
            'If your application is unsuccessful we will also contact you advising you of this, and our reasons for the decision. You can find out more about offers here: Information and advice for offer holders.Stage 4: Immigration and enrolment After obtaining an unconditional offer you will need to focus on making preparations to join the university and your arrangements to come to the UK (if you are not already here). You will receive further information about when and where to arrive, and how to attend your course enrolment closer to the enrolment period of your course.You should be considering your accommodation and finances as soon as possible before the start of term, and you should also be aware of, and be prepared to meet, any immigration requirements such as obtaining a student visa at the earliest opportunity. You can find a variety of information about moving to London here: Immigration and Arrival Advice: New Students.</p>'
        )

        # 26. tuition_fee_pre
        tuition_fee_pre = '£'

        item['apply_pre'] = apply_pre
        item['apply_documents_en'] = apply_documents_en
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['overview_en'] = overview_en
        item['start_date'] = start_date
        item['duration'] = duration
        item['tuition_fee'] = tuition_fee
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['career_en'] = career_en
        item['require_chinese_en'] = require_chinese_en
        item['apply_proces_en'] = apply_proces_en
        item['tuition_fee_pre'] = tuition_fee_pre
        item['alevel'] = alevel
        item['ucascode'] = ucascode
        yield item

示例#7

显示文件

文件： UniversityofHull_u.py 项目： histudent/python_spider

    def parse(self, response):
        """Scrape one University of Hull undergraduate course page.

        Extracts the course fields from ``response`` and stores them on an
        item created by ``get_item1(ScrapyschoolEnglandItem1)``.

        ``duration`` is not populated by this callback (its lookup is
        disabled).

        Yields:
            The populated item, exactly once.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def html_of(query):
            # Concatenate the extracted form of everything the query matches.
            return ''.join(response.xpath(query).extract())

        # 1. university
        university = 'University of Hull'

        # 2. url
        url = response.url

        # 3. programme_en (two page layouts are supported)
        programme_en = remove_tags(html_of(
            '//*[@id="main-content"]/header/div[2]/div[1]/h1|//*[@id="main-content"]/section[1]/div[2]/div/div/h1'))

        # 4. degree_type
        degree_type = 1

        # 5. degree_name
        degree_name = remove_tags(html_of(
            '//*[@id="main-content"]/header/div[2]/div[1]/p[2]/span[2]'))

        # 6. start_date
        start_date = '2019-9'

        # 7. ucascode
        ucascode = html_of(
            '//*[@id="main-content"]/header/div[2]/div[2]/div/div[3]/span')
        ucascode = clear_space_str(remove_tags(ucascode).strip())

        # 8. apply_desc_en
        apply_desc_en = remove_class(html_of('//*[@id="entry"]/div/div[1]'))

        # 9. overview_en
        overview_en = remove_class(html_of('//*[@id="about"]/div/div[1]/p'))

        # 10. tuition_fee
        tuition_fee = getTuition_fee(html_of(
            "//*[contains(text(),'Fees and funding')]//following-sibling::*"))

        # 11. tuition_fee_pre
        tuition_fee_pre = '£'

        # 12. modules_en
        modules_en = remove_class(html_of('//*[@id="study"]/div/div[1]'))

        # 13. ib
        ib = remove_tags(html_of(
            "//*[contains(text(),'Alternative qualifications')]/../following-sibling::*//li[1]"))

        # 14. career_en
        career_en = remove_class(html_of(
            "//*[contains(text(),'Future prospects')]//following-sibling::*"))

        # 15. require_chinese_en (stored as a link)
        require_chinese_en = 'https://www.hull.ac.uk/choose-hull/study-at-hull/international/country/china.aspx'

        # 16-20. ielts: when exactly two "d.d" scores are listed, the first is
        # used as the overall score and the second for every component;
        # otherwise fall back to 6.0 overall / 5.5 per component.
        # re.findall never raises on a str, so no try/except is needed (the
        # old handler set None, which the following len() could not handle).
        scores = re.findall(
            r'\d\.\d',
            html_of("//*[contains(text(),'International students')]//following-sibling::*"))
        if len(scores) == 2:
            # NOTE(review): scraped scores stay as strings while the defaults
            # below are floats - confirm which type downstream code expects.
            ielts, component = scores
            ielts_l = ielts_r = ielts_s = ielts_w = component
        else:
            ielts = 6.0
            ielts_l = ielts_r = ielts_s = ielts_w = 5.5

        # 23. apply_pre
        apply_pre = '£'

        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['start_date'] = start_date
        item['overview_en'] = overview_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['modules_en'] = modules_en
        item['ib'] = ib
        item['career_en'] = career_en
        item['require_chinese_en'] = require_chinese_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['apply_desc_en'] = apply_desc_en
        item['ucascode'] = ucascode
        yield item

示例#8

显示文件

文件： RoyalAgriculturalUniversity_u.py 项目： histudent/python_spider

    def parse(self, response):
        pass
        item = get_item1(ScrapyschoolEnglandItem1)

        #1.university
        university = 'Royal Agricultural University'
        # print(university)

        #2.url
        url = response.url
        # print(url)

        #3.programme_en
        programme_en = response.xpath(
            '//*[@id="site"]//div[1]/div//h1').extract()
        programme_en = ''.join(programme_en)
        programme_en = remove_tags(programme_en)
        # print(programme_en)

        #4.degree_type
        degree_type = 1

        #5.degree_name
        if '(Hons) ' in programme_en:
            programme_en = programme_en.replace('(Hons) ', '')
        degree_name = programme_en.split()[0]
        programme_en = programme_en.replace(degree_name, '').strip()
        # print(degree_name)
        # print(programme_en)

        #6.overview_en
        overview_en = response.xpath(
            '//*[@id="course-overview"]/div[1]/p').extract()
        overview_en = ''.join(overview_en)
        overview_en = remove_class(overview_en)
        # print(overview_en)

        #7.ucascode
        ucascode = response.xpath(
            '//*[@id="site"]/div/main/div/div/div[2]/div/div/div/h3').extract(
            )
        ucascode = ''.join(ucascode)
        ucascode = remove_tags(ucascode)
        ucascode = clear_space_str(ucascode)
        # print(ucascode)

        #8.modules_en
        modules_en = response.xpath(
            "//*[contains(text(),'Modules')]//following-sibling::ul/li"
        ).extract()
        modules_en = '\n'.join(modules_en)
        modules_en = remove_class(modules_en)
        # print(modules_en)

        #9.apply_desc_en
        apply_desc_en = response.xpath(
            '//*[@id="course-requirements"]/div[1]').extract()
        apply_desc_en = ''.join(apply_desc_en)
        apply_desc_en = remove_class(apply_desc_en)

        #10.tuition_fee
        tuition_fee = response.xpath(
            '//*[@id="course-fees"]/div[1]/table[1]/tbody/tr[1]/td[3]'
        ).extract()
        tuition_fee = ''.join(tuition_fee)
        tuition_fee = remove_tags(tuition_fee)
        tuition_fee = getTuition_fee(tuition_fee)
        # print(tuition_fee)

        #11.tuition_fee_pre
        tuition_fee_pre = '£'

        #12.career_en
        career_en = response.xpath(
            "//*[contains(text(),'Prospects')]//following-sibling::*").extract(
            )
        career_en = ''.join(career_en)
        career_en = remove_class(career_en).strip()
        # print(career_en)

        #13.apply_proces_en
        apply_proces_en = response.xpath(
            "//*[contains(text(),'Apply now')]//following-sibling::div[1]"
        ).extract()
        apply_proces_en = ''.join(apply_proces_en)
        apply_proces_en = remove_class(apply_proces_en).strip()
        # print(apply_proces_en)

        #14.start_date
        start_date = '2018-9-1'

        #15.assessment_en
        assessment_en = '<p>During your undergraduate degree, you probably became familiar with many of the methods of delivery and study that we expect you to continue with during your postgraduate course. It is expected that you come already equipped with the basics in academic study, such as the ability to find, evaluate, manage, present and critique research or industry relevant output. There is a greater emphasis on independence and individual contribution towards the topics covered, and so the expectation is that students will actively participate in class-based activities from the outset. Giving presentations, critiquing case studies, using peer-to-peer feedback, working in groups on topical problems and justifying opinions based on the evidence is the norm for postgraduate study. It is not uncommon for students to arrive at a particular postgraduate qualification with very diverse backgrounds, qualifications and experience and we welcome these different perspectives in the classroom to bring a debate alive, however, it does require the student to take responsibility for their own subject knowledge gaps and motivate themselves to fill them. Of course, there will be support and guidance provided for good sources of information, however, it is not expected that these gaps will be specifically addressed within the taught sessions.For most postgraduate programmes group sizes are in the range of between 20 – 100 depending on the course and electives chosen (if relevant). However, alongside the lectures are small group seminars and tutorials where you will have the opportunity to explore key concepts in more detail, discuss topical issues relating to the key themes and undertake practical activities that help set the theories in context. 
To compliment the lectures and seminars, there may also be practical sessions, laboratory classes, off-site visits, case studies, guest speakers and field trips that are included in your timetabled activities depending on the modules you are studying.</p>'

        #16.deadline
        deadline = '2018-11,2019-5'

        #17.require_chinese_en
        require_chinese_en = '<p>International Foundation Year We run an International Foundation Year programme in partnership with our partner, INTO London World education Centre based in London.  To enquire about the programme please get in touch with our admissions team: [email protected] Undergraduate Degrees (Bachelors) Senior Secondary School Graduation certificate 高中毕业证书 with overall grade B or higher (to include Maths) Plus Gao Kao – Chinese University/College Entrance examination (高考) with good grades OR completion of a recognised International Foundation course with overall grade 60% or above OR successful completion of 1 year of University degree with a minimum of 60%.And IELTS band score 6.0 overall or above with no less than 5.5 in each component of the academic IELTS test. (The test must have been taken within two years of the start of the course). =Academic transfers to RAU into Years 2 and 3 are possible. For more information contact [email protected]</p>'

        #18.ielts 19202122
        ielts = 6.0
        ielts_s = 5.5
        ielts_w = 5.5
        ielts_l = 5.5
        ielts_r = 5.5

        #19.apply_pre
        apply_pre = '£'

        #20.alevel
        alevel = response.xpath(
            '//*[@id="course-requirements"]/div[1]/div/ul[1]/li[1]').extract()
        alevel = ''.join(alevel)
        alevel = remove_class(alevel)
        # print(alevel)

        #21.duration
        duration = 3

        #22.ib
        ib = response.xpath(
            '//*[@id="course-requirements"]/div[1]/div/ul[1]/li[4]').extract()
        ib = ''.join(ib)
        ib = remove_class(ib)
        # print(ib)

        item['ib'] = ib
        item['duration'] = duration
        item['alevel'] = alevel
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['apply_desc_en'] = apply_desc_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['career_en'] = career_en
        item['apply_proces_en'] = apply_proces_en
        item['start_date'] = start_date
        item['assessment_en'] = assessment_en
        item['deadline'] = deadline
        item['require_chinese_en'] = require_chinese_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['ielts_s'] = ielts_s
        item['ucascode'] = ucascode
        yield item

示例#9

显示文件

    def parse(self, response):
        """Build and yield one item for a Sheffield Hallam University course page.

        Most fields are read from ``response`` with XPath queries and cleaned
        with the helpers already in scope (``remove_tags``, ``remove_class``,
        ``clear_space_str``, ``getTuition_fee``, ``translate_month``); the
        remaining fields are hard-coded constants.

        Args:
            response: the page response; only ``response.url`` and
                ``response.xpath(...)`` are used.

        Yields:
            The item returned by ``get_item1(ScrapyschoolEnglandItem1)`` with
            the fields below filled in.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        #1.university
        university = 'Sheffield Hallam University'

        #2.url
        url = response.url

        #3.programme_en
        programme_en = response.xpath("/html/body/section[1]//h1").extract()
        programme_en = remove_tags(''.join(programme_en))

        #4.degree_type
        degree_type = 2

        #5.degree_name
        degree_name = response.xpath(
            '/html/body/section[1]/div/div[2]/span').extract()
        degree_name = remove_tags(''.join(degree_name))

        #6.tuition_fee
        tuition_fee = response.xpath(
            "//*[contains(text(),'What is the fee?')]//following-sibling::*"
        ).extract()
        tuition_fee = getTuition_fee(''.join(tuition_fee))

        #7.tuition_fee_pre
        tuition_fee_pre = '£'

        #8.duration 16.duration_per
        duration_list = response.xpath(
            "//*[contains(text(),'How long will I study?')]//following-sibling::*"
        ).extract()
        duration_list = remove_tags(''.join(duration_list))
        try:
            # First number on the page; stays a string, as before.
            duration = re.findall(r'\d+', duration_list)[0]
        except IndexError:
            # No number found on the page.
            duration = 1
        # A value above 5 is treated as a different unit (code 3) than a
        # small one (code 1) -- presumably months vs. years; confirm against
        # the item schema.  This value used to be overwritten by a later
        # hard-coded ``duration_per = 1``, which made this branch pointless.
        if int(duration) > 5:
            duration_per = 3
        else:
            duration_per = 1

        #9.location
        location = 'Sheffield'

        #10.teach_time
        teach_time = response.xpath('/html/body/section[1]//span[1]').extract()
        teach_time = remove_tags(''.join(teach_time))
        if 'Full-time' in teach_time:
            teach_time = 'Full-time'
        else:
            teach_time = 'Part-time'

        #11.overview_en
        overview_en = response.xpath(
            "//*[contains(text(),'Course summary')]//following-sibling::*"
        ).extract()
        overview_en = remove_class(''.join(overview_en))
        overview_en = clear_space_str(overview_en)

        #12.career_en
        career_en = response.xpath(
            "//*[contains(text(),'Future careers')]//following-sibling::*"
        ).extract()
        career_en = remove_class(''.join(career_en))
        career_en = clear_space_str(career_en)

        #13.rntry_requirements
        rntry_requirements = response.xpath(
            '//*[@id="entry-requirements"]/div').extract()
        rntry_requirements = remove_class(''.join(rntry_requirements))

        #14.modules_en
        modules_en = response.xpath(
            "//*[contains(text(),'Compulsory modules')]/../following-sibling::*"
        ).extract()
        modules_en = remove_class(''.join(modules_en))
        modules_en = clear_space_str(modules_en)

        #15.apply_proces_en
        apply_proces_en = response.xpath(
            '//*[@id="apply-now"]/div[1]//a/@href').extract()
        apply_proces_en = ''.join(apply_proces_en)

        #17.ielts 18192021
        ielts_list = re.findall(r'[567]\.\d', rntry_requirements)
        if len(ielts_list) >= 2:
            # First score is the overall band, second the per-skill minimum.
            ielts, ielts_sub = ielts_list[0], ielts_list[1]
        elif ielts_list:
            # Only one score on the page: use it for every band instead of
            # raising IndexError on ielts_list[1].
            ielts = ielts_sub = ielts_list[0]
        else:
            # Defaults used when the page lists no score.
            ielts, ielts_sub = 6.5, 6.0
        ielts_r = ielts_l = ielts_s = ielts_w = ielts_sub

        #22.require_chinese_en
        require_chinese_en = '<p>The following qualifications from China will be considered for entry on to postgraduate taught programmes, with a usual minimum average of 60 per cent Four year Bachelor Degree from a recognised university Three year university diploma plus relevant work experience Successful completion of a recognised pre-masters course</p>'

        #23.apply_fre
        apply_pre = '£'

        #24.start_date
        start_date = response.xpath(
            "//*[contains(text(),'When do I start?')]//following-sibling::*"
        ).extract()
        start_date = remove_tags(''.join(start_date))
        start_date = clear_space_str(start_date)
        if 'September, January' in start_date:
            start_date = '2018-9,2019-1'
        elif 'January' in start_date:
            start_date = '2019-1'
        else:
            start_date = '2018-' + str(translate_month(start_date))

        item['start_date'] = start_date
        item['apply_pre'] = apply_pre
        item['require_chinese_en'] = require_chinese_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['duration'] = duration
        item['location'] = location
        item['teach_time'] = teach_time
        item['overview_en'] = overview_en
        item['career_en'] = career_en
        item['rntry_requirements'] = rntry_requirements
        item['modules_en'] = modules_en
        item['apply_proces_en'] = apply_proces_en
        item['duration_per'] = duration_per
        yield item

示例#10

显示文件

    def parse(self, response):
        """Build and yield one item for a Bournemouth University course page.

        Fields are read from ``response`` with XPath queries and cleaned with
        the helpers already in scope (``remove_tags``, ``remove_class``,
        ``clear_space_str``, ``tracslateDate``, ``getTuition_fee``); the
        application text, Chinese-applicant requirements and a few other
        fields are hard-coded.

        Args:
            response: the page response; only ``response.url`` and
                ``response.xpath(...)`` are used.

        Yields:
            The item returned by ``get_item1(ScrapyschoolEnglandItem1)`` with
            the fields below filled in.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        #1.university
        university = 'Bournemouth University'

        #2.location
        location = response.xpath(
            "//*[contains(text(),'Location:')]//following-sibling::p").extract()
        location = remove_tags(''.join(location))

        #3.programme_en 4.degree_name
        programme_en = response.xpath('/html/body/div/section//h1').extract()
        programme_en = remove_tags(''.join(programme_en))
        # The first word of the heading is taken as the degree name and then
        # removed from the programme title.
        try:
            degree_name = programme_en.split()[0]
        except IndexError:
            # Empty heading.
            degree_name = ''
        programme_en = programme_en.replace('-', '')
        programme_en = programme_en.replace(degree_name, '')
        programme_en = clear_space_str(programme_en)
        if '–' in programme_en:
            programme_en = programme_en.replace('–', '').strip()
        programme_en = programme_en.replace('&amp;', '')

        #5.degree_type
        degree_type = 2

        #6.teach_time
        teach_time = response.xpath(
            "//*[contains(text(),'Delivery:')]//following-sibling::*").extract()
        teach_time = remove_tags(''.join(teach_time))
        if 'Full time' in teach_time:
            teach_time = 'Full time'
        else:
            teach_time = 'Part time'

        #7.duration #8.duration_per
        duration_text = response.xpath(
            "//*[contains(text(),'Duration:')]//following-sibling::p").extract()
        duration_text = remove_tags(''.join(duration_text))
        # (phrase, duration, duration_per), tested in order; first hit wins.
        # Phrases paired with months carry duration_per 3, years carry 1.
        # '18-36 months' must come before '36 months': it used to sit after
        # it and could never match, so such courses were stored as 36.
        duration_rules = (
            ('1 year', 1, 1),
            ('12-18 months', 12, 3),
            ('18-36 months', 18, 3),
            ('36 months', 36, 3),
            ('1 to 2 years', 1, 1),
            ('2 years', 2, 1),
            ('3-5 years', 3, 1),
            ('48 months', 48, 3),
            ('12 months', 12, 3),
            ('5 years', 5, 1),
            ('3 years', 3, 1),
            ('14 months', 14, 3),
            ('15 months', 15, 3),
            ('18-24 months', 18, 3),
            ('27 months', 27, 3),
            ('8 months', 8, 3),
            ('Nine months', 9, 3),
        )
        # Default when no phrase matches.
        duration, duration_per = 1, 1
        for phrase, amount, unit in duration_rules:
            if phrase in duration_text:
                duration, duration_per = amount, unit
                break

        #9.overview_en
        overview_en = response.xpath(
            '//*[@id="main-content"]/div/section[2]/p').extract()
        overview_en = remove_class(''.join(overview_en))
        overview_en = clear_space_str(overview_en)

        #10.teach_time
        # NOTE(review): this replaces the value scraped under #6 with a
        # constant, so the 'Delivery:' lookup has no effect on the item.
        # Kept as-is because it may be deliberate -- confirm.
        teach_time = 'full time'

        #11.modules_en
        modules_en = response.xpath(
            "//section[@id='course-details']//div[@id='accordion-1']").extract()
        modules_en = remove_class(''.join(modules_en))
        modules_en = clear_space_str(modules_en)

        #12.start_date
        start_date = response.xpath(
            "//*[contains(text(),'Next start date:')]//following-sibling::p"
        ).extract()
        start_date = remove_tags(''.join(start_date))
        start_date = clear_space_str(start_date)
        start_date = ','.join(tracslateDate(start_date))

        #13.rntry_requirements
        rntry_requirements = response.xpath(
            "//*[contains(text(),'Entry requirements')]/../following-sibling::div[1]"
        ).extract()
        rntry_requirements = remove_class(''.join(rntry_requirements))
        rntry_requirements = clear_space_str(rntry_requirements)

        #14.ielts 15.16.17.18
        ielts_list = re.findall(r'\d\.\d', rntry_requirements)
        # Which matches hold (overall, per-skill) depends on how many
        # decimal numbers the requirements text contains.
        ielts_positions = {4: (2, 3), 3: (1, 2), 2: (0, 1), 1: (0, 0)}
        positions = ielts_positions.get(len(ielts_list))
        if positions is None:
            ielts = None
            ielts_sub = None
        else:
            ielts = ielts_list[positions[0]]
            ielts_sub = ielts_list[positions[1]]
        ielts_l = ielts_s = ielts_r = ielts_w = ielts_sub

        #19.career_en
        career_en = response.xpath(
            "//*[contains(text(),'Careers')]/../following-sibling::*|//*[contains(text(),'Careers')]//following-sibling::*"
        ).extract()
        career_en = remove_class(''.join(career_en))
        career_en = clear_space_str(career_en)

        #20.tuition_fee,#21.tuition_fee_pre
        tuition_fee_list = response.xpath(
            '//*[@id="fees-box"]/div/div/span|//*[@id="fees-box"]/div[2]/div[2]/p[2]|//*[@id="fees-box"]/div/div[2]/ul/li[1]'
        ).extract()
        tuition_fee = getTuition_fee(''.join(tuition_fee_list))
        tuition_fee_pre = '£'

        #22.url
        url = response.url

        #23.application_open_date
        application_open_date = '2018-7-18'

        #24.apply_pre
        apply_pre = '£'

        #25.apply_fee
        apply_fee = 0

        #26.apply_proces_en
        apply_proces_en = '<p>Step 1: Application Complete all sections of your country’s application form. Step 2: Terms and conditions You must read, understand and agree to be bound by the terms and conditions before moving on to the next step. Step 3: Confirmation Sign the application form to confirm you have provided correct details and you agree to the terms and conditions. Step 4: Other documents Scan, attach and send in your additional documents to the email address on the application form: Academic transcripts and exam results English test score if required If you do not have academic transcripts or English test results, you can still apply and we can make you a conditional offer, the conditions of which you will need to satisfy before we confirm your place. Step 5: Assessment We’ll contact you to arrange one or more of the following if required: English language test Mathematics test Interview This will allow us to further assess your suitability for the program. Step 6: Admission decision Receive an admission decision and, if your application is successful, accompanying offer letter. Step 7: Deposit Accept your offer by paying the deposit. Step 8: Pre-arrival Receive confirmation of program acceptance, pre-arrival information, plus guidance on finding local accommodation if you are coming from outside the USPP host city to study. For conditional offers, these items are issued once we receive proof that the conditions of the offer have been met. Step 9: Travel Arrange travel to USPP location if applicable.Arrive at your USPP center for orientation before classes begin. Step 10: Begin your USPP program USPP teaching begins. View the program timeline for next steps.</p>'

        #27.require_chinese_en
        # The text contains a line break and tab characters; they are written
        # as escapes because a quoted literal cannot span physical lines.
        require_chinese_en = (
            "<p>This is a guide to the normal entry requirements, assuming you’ve followed the Chinese education system. An admissions tutor will study your application, so make sure you include your academic background and personal information when you apply.Entry requirements vary depending on what sort of course you’re coming to BU to study. BU International College Foundation Certificates You can undertake a Foundation Certificate before going on to an undergraduate course if you’ve completed 11 years of schooling or Senior High School Year 2 in China and have a minimum of IELTS (Academic) 5.0. Undergraduate courses You can apply to study a Bachelor's degree from year one if you hold a Chinese Senior High School Diploma plus successful completion of a relevant first-year undergraduate programme in a recognised Chinese university, or a Diploma from Specialized College (zhongzhuan). Chinese Senior High School certificate of graduation with overall HuiKao result grade B average,  transcripts of 3 years with 85% average (85% also eligible for AES). Top-up courses You need to hold a College Graduation Diploma (Dazhuan awarded by a university/college on completion of two to three years of study), or a BTEC Higher National Diploma or Foundation degree in a relevant subject.Postgraduate courses You need to have a Bachelor's (Honours) degree from a recognised Chinese university, normally from a four-year undergraduate programme, or a Bachelors degree from Higher Education Self-Study Examinations, or a Top-up degree or university-recognised Pre-Master’s Foundation programme. \n"
            "Grade requirements from Chinese Bachelor's degree holders are as below: Applicants from 985 or 211 universities Media studies and other subjects equivalent to UK 2:1 degree\t65% +\tGPA 2.25 + Business and subjects equivalent to UK 2:2 degree\t60% +\tGPA 2.0 + Academic Excellence Scholarship (automatic award of £3500)\t75% +\tGPA 2.75 + Applicants from other universities Media studies and other subjects equivalent to UK 2:1 degree\t70% +\tGPA 2.5 + Business and subjects equivalent to UK 2:2 degree\t65% +\tGPA 2.25 + Academic Excellence Scholarship (automatic award of £3500)\t80% +\tGPA 3.0 + Research programmes You need a good postgraduate degree to be considered for a BU research programme. Please see more detail on the postgraduate research page.You can find more information about English language requirements for entry to BU on our English language requirements page. Full information about preparatory courses is available on the Bournemouth University International College website.If you need help with your visa or want more information about the immigration process, you can find it on our immigration information page.</p>"
        )

        item['require_chinese_en'] = require_chinese_en
        item['apply_proces_en'] = apply_proces_en
        item['apply_fee'] = apply_fee
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['location'] = location
        item['programme_en'] = programme_en
        item['degree_name'] = degree_name
        item['degree_type'] = degree_type
        item['teach_time'] = teach_time
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['start_date'] = start_date
        item['rntry_requirements'] = rntry_requirements
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['url'] = url
        item['application_open_date'] = application_open_date
        yield item

示例#11

显示文件

文件： SwinburneUniversityofTechnology_p.py 项目： histudent/python_spider

    def parse(self, response):
        """Build and yield one item for a Swinburne University of Technology course page.

        Fields are read from ``response`` with XPath queries and cleaned with
        the helpers already in scope (``remove_tags``, ``remove_class``,
        ``clear_space_str``).  Each scraped field is extracted on a
        best-effort basis: if its extraction fails the field falls back to a
        placeholder ('N/A', '' or 0) instead of aborting the whole item.

        Args:
            response: the page response; only ``response.url`` and
                ``response.xpath(...)`` are used.

        Yields:
            The item returned by ``get_item1(ScrapyschoolEnglandItem1)`` with
            the fields below filled in.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        #1.university
        university = 'Swinburne University of Technology'

        #2.url
        url = response.url

        #3.department
        try:
            # The faculty name is read from an HTML comment node on the page.
            department = response.xpath(
                '//*[@id="content"]/main/section[1]/div[2]/div/comment()'
            ).extract()
            department = ''.join(department).replace('Faculty', '')
            department = clear_space_str(department)
            department = remove_tags(department).replace(' -->', '').strip()
            department = 'Faculty ' + department
        except Exception:
            department = 'N/A'

        #4.programme_en
        try:
            programme_en = response.xpath(
                '//*[@id="content"]/main/section[1]/header/div[1]/h1'
            ).extract()[0]
            programme_en = remove_tags(programme_en)
            programme_en = programme_en.replace('Master of ', '').strip()
        except Exception:
            programme_en = 'N/A'
        # Titles carrying one of these qualifiers are kept whole; for any
        # other title the text inside the parentheses becomes the programme.
        kept_qualifiers = ('(International)', ' (Professional)', '(Advanced)',
                           '(Executive)')
        if not any(marker in programme_en for marker in kept_qualifiers):
            bracketed = re.findall(r'\((.*)\)', programme_en)
            # Guarded: a '(' without a closing ')' used to raise IndexError.
            if bracketed:
                programme_en = bracketed[0]

        #5.degree_name
        degree_name = response.xpath(
            '//*[@id="content"]/main/section[1]/header/div[1]/h1').extract()
        degree_name = remove_tags(''.join(degree_name))

        #6.start_date
        start_date = '2,7'

        #7.degree_overview_en
        try:
            degree_description = response.xpath(
                '//*[@id="content"]/main/section[1]/div[2]/div[1]').extract()
            degree_overview_en = remove_class(''.join(degree_description))
        except Exception:
            degree_overview_en = 'N/A'

        #8.apply_pre
        apply_pre = 'A$'

        #9.duration
        try:
            duration = response.xpath(
                '//h3[contains(text(),"Duration")]/following-sibling::p'
            ).extract()[0]
            duration = remove_tags(duration)
        except Exception:
            duration = 'N/A'

        # 10.modules_en
        try:
            modules_en = response.xpath(
                '//h3[contains(text(),"Course description")]/following-sibling::div'
            ).extract()
            modules_en = remove_class(''.join(modules_en))
        except Exception:
            modules_en = 'N/A'

        # 11.career_en
        try:
            career_en = response.xpath(
                '//h3[contains(text(),"Career")]/following-sibling::div[1]//text()'
            ).extract()
            career_en = '<p>' + ''.join(career_en).strip() + '</p>'
        except Exception:
            career_en = ''

        # 12.tuition_fee
        try:
            fee_text = response.xpath(
                '//h3[contains(text(),"Fee")]/following-sibling::p'
            ).extract()[0]
            fee_matches = re.findall(r'\$\d{4,6}', remove_tags(fee_text))
            # Use the first amount only: joining several matches used to glue
            # their digits into one meaningless number.  No match still
            # yields '' as before.
            tuition_fee = fee_matches[0].replace('$', '') if fee_matches else ''
        except Exception:
            tuition_fee = 0

        #13.rntry_requirements_en
        try:
            rntry_requirements_en = response.xpath(
                '//h3[contains(text(),"Entry requirements")]/following-sibling::div'
            ).extract()[0]
            rntry_requirements_en = remove_class(rntry_requirements_en)
        except Exception:
            rntry_requirements_en = 'N/A'

        # 14.ielts 15161718 19.toefl 20212223
        ielts_text = response.xpath(
            '//h3[contains(text(),"English language requirements")]/following-sibling::div'
        ).extract()
        ielts_text = ''.join(ielts_text)

        # IELTS: the highest score found is the overall band, the lowest is
        # used for every skill.  Scores are 'd.d' strings, so string
        # comparison orders them correctly.
        ieltss = re.findall(r'[567]\.\d', ielts_text)
        if ieltss:
            ielts = max(ieltss)
            ielts_l = ielts_s = ielts_r = ielts_w = min(ieltss)
        else:
            ielts = ''
            ielts_l = ielts_s = ielts_r = ielts_w = ''

        # TOEFL: pick the two-digit numbers out of the "score of NN ..."
        # sentence.  With exactly three numbers they are read as overall,
        # reading/writing and speaking/listening.
        toefl_text = ''.join(
            re.findall(
                r'score of [6-9]\d[\sa-zA-Z\,]*[0-2]\d[\sa-zA-Z]*[0-2]?\d?',
                ielts_text))
        toefls = re.findall(r'\d{2}', toefl_text)
        if len(toefls) == 3:
            toefl = toefls[0]
            toefl_r = toefl_w = toefls[1]
            toefl_s = toefl_l = toefls[2]
        else:
            # Keep just the leading score (or '' when nothing matched)
            # rather than leaving the whole matched sentence in ``toefl``.
            toefl = toefls[0] if toefls else ''
            toefl_r = toefl_w = toefl_s = toefl_l = None

        #24.apply_proces_en
        apply_proces_en = [
            "Before you start your application, make sure you have followed these important steps. After checking these details, you will be ready to start your application to study at Swinburne.",
            "You can also read about the Australian Government’s Education Services for Overseas Students (ESOS) regulatory framework so that you understand your rights and responsibilities as an international student before and during your study.",
            "1. Check that you are an international student",
            "2. Select your course",
            "3. Check entry requirements",
            "4. Check to see if you are eligible for credit",
            "5. Review tuition fees",
            "6. Compile education and employment history",
            "7. Prepare your documents",
            "8. Begin your application",
        ]
        apply_proces_en = '<p>' + ''.join(apply_proces_en) + '</p>'

        # 25.apply_desc_en
        apply_desc_en = [
            "You may also be required to submit documents to support your application.",
            "certified academic documents",
            "certified copy of your passport",
            "English proficiency test results",
            "certified copy of unit outlines and academic transcripts",
            "portfolio (for most design courses)",
            "English translations of all documents, if not already in English",
        ]
        apply_desc_en = '<p>' + ''.join(apply_desc_en) + '</p>'

        #27.location
        location = 'Hawthorn'

        #28.tuition_fee_pre
        tuition_fee_pre = 'A$'

        #29.degree_type
        degree_type = 2

        item['tuition_fee_pre'] = tuition_fee_pre
        item['degree_type'] = degree_type
        item['university'] = university
        item['url'] = url
        item['department'] = department
        item['programme_en'] = programme_en
        item['degree_name'] = degree_name
        item['start_date'] = start_date
        item['degree_overview_en'] = degree_overview_en
        item['duration'] = duration
        item['apply_pre'] = apply_pre
        item['modules_en'] = modules_en
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['rntry_requirements_en'] = rntry_requirements_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['ielts_s'] = ielts_s
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_w'] = toefl_w
        item['toefl_s'] = toefl_s
        item['toefl_l'] = toefl_l
        item['apply_proces_en'] = apply_proces_en
        item['apply_desc_en'] = apply_desc_en
        item['location'] = location
        yield item

示例#12

显示文件

    def parse(self, response):
        """Scrape one Bournemouth University course page into an item.

        Each field is located in ``response`` with XPath, cleaned with the
        helper functions already used by this file (``remove_tags``,
        ``remove_class``, ``clear_space_str``, ``tracslateDate``,
        ``getTuition_fee``) and stored on the item returned by
        ``get_item1(ScrapyschoolEnglandItem1)``.

        Args:
            response: the downloaded course page; must provide ``xpath()``
                and ``url``.

        Yields:
            The populated item.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        # 1. university
        university = 'Bournemouth University'

        # 2. location
        location = remove_tags(''.join(response.xpath(
            "//*[contains(text(),'Location:')]//following-sibling::p"
        ).extract()))

        # 3. programme_en / 4. degree_name
        programme_en = remove_tags(''.join(
            response.xpath('/html/body/div/section//h1').extract()))
        # The first whitespace-separated token of the heading is taken as the
        # degree name; an empty heading has no tokens.
        try:
            degree_name = programme_en.split()[0]
        except IndexError:
            degree_name = ''
        # The degree name is captured before hyphens are dropped, then removed
        # from the title itself.
        programme_en = programme_en.replace('-', '').replace(degree_name, '')
        programme_en = clear_space_str(programme_en)
        programme_en = (programme_en.replace('–', '')
                        .replace('&amp;', '')
                        .replace('(Hons)', '')
                        .strip())

        # 5. degree_type
        degree_type = 1

        # 6. ucascode
        ucascode = remove_tags(''.join(response.xpath(
            "//*[contains(text(),'UCAS Code:')]//following-sibling::*"
        ).extract()))

        # 7. duration / 8. duration_per
        duration_text = remove_tags(''.join(response.xpath(
            "//*[contains(text(),'Duration:')]//following-sibling::p"
        ).extract()))
        duration_per = 1
        if 'Four years' in duration_text:
            duration = 4
        else:
            # Use the first digit found; fall back to None instead of raising
            # IndexError when the duration text contains no digit at all.
            digits = re.findall(r'\d', duration_text)
            duration = digits[0] if digits else None

        # 9. overview_en
        overview_en = ''.join(response.xpath(
            '//*[@id="main-content"]/div/section[3]/p').extract())
        overview_en = clear_space_str(remove_class(overview_en))

        # 10. alevel: the second <p> preceding the element mentioning 'GCSEs'
        alevel_paragraphs = response.xpath(
            "//*[contains(text(),'GCSEs')]//preceding-sibling::p").extract()
        try:
            alevel = remove_tags(alevel_paragraphs[1])
        except IndexError:
            alevel = 'N/A'

        # 11. modules_en
        modules_en = ''.join(response.xpath(
            "//section[@id='course-details']//div[@id='accordion-1']"
        ).extract())
        modules_en = clear_space_str(remove_class(modules_en))

        # 12. start_date
        start_date = remove_tags(''.join(response.xpath(
            "//*[contains(text(),'Next start date:')]//following-sibling::p"
        ).extract()))
        start_date = ','.join(tracslateDate(clear_space_str(start_date)))

        # 13. ib (capped at 500 characters)
        ib = remove_tags(''.join(response.xpath(
            "//*[contains(text(),'International Baccalaureate')]/.."
        ).extract()))[:500]

        # 14-18. ielts: every "d.d" number in the entry-requirements block is
        # collected. With 2-4 numbers the last one is used for each component
        # and the one before it for the overall score; a single number is used
        # for everything; any other count leaves all scores as None.
        rntry_requirements = ''.join(response.xpath(
            '//*[@id="entry-requirements"]/div').extract())
        ielts_list = re.findall(r'\d\.\d', rntry_requirements)
        if 2 <= len(ielts_list) <= 4:
            ielts, ielts_component = ielts_list[-2], ielts_list[-1]
        elif len(ielts_list) == 1:
            ielts = ielts_component = ielts_list[0]
        else:
            ielts = ielts_component = None
        ielts_l = ielts_s = ielts_r = ielts_w = ielts_component

        # 19. career_en
        career_en = ''.join(response.xpath(
            "//*[contains(text(),'Careers')]//following-sibling::*").extract())
        career_en = clear_space_str(remove_class(career_en))

        # 20. tuition_fee / 21. tuition_fee_pre
        tuition_fee_html = ''.join(response.xpath(
            '//*[@id="fees-box"]/div/div/span|//*[@id="fees-box"]/div[2]/div[2]/p[2]|//*[@id="fees-box"]/div/div[2]/ul/li[1]'
        ).extract())
        tuition_fee = getTuition_fee(tuition_fee_html)
        tuition_fee_pre = '£'

        # 22. url
        url = response.url

        # 23. application_open_date
        application_open_date = '2018-7-18'

        # 24. apply_pre
        apply_pre = '£'

        # 25. apply_fee
        apply_fee = 0

        # 26. apply_proces_en
        apply_proces_en = '<p>Step 1: Application Complete all sections of your country’s application form. Step 2: Terms and conditions You must read, understand and agree to be bound by the terms and conditions before moving on to the next step. Step 3: Confirmation Sign the application form to confirm you have provided correct details and you agree to the terms and conditions. Step 4: Other documents Scan, attach and send in your additional documents to the email address on the application form: Academic transcripts and exam results English test score if required If you do not have academic transcripts or English test results, you can still apply and we can make you a conditional offer, the conditions of which you will need to satisfy before we confirm your place. Step 5: Assessment We’ll contact you to arrange one or more of the following if required: English language test Mathematics test Interview This will allow us to further assess your suitability for the program. Step 6: Admission decision Receive an admission decision and, if your application is successful, accompanying offer letter. Step 7: Deposit Accept your offer by paying the deposit. Step 8: Pre-arrival Receive confirmation of program acceptance, pre-arrival information, plus guidance on finding local accommodation if you are coming from outside the USPP host city to study. For conditional offers, these items are issued once we receive proof that the conditions of the offer have been met. Step 9: Travel Arrange travel to USPP location if applicable.Arrive at your USPP center for orientation before classes begin. Step 10: Begin your USPP program USPP teaching begins. View the program timeline for next steps.</p>'

        # 27. assessment_en
        assessment_en = remove_class(''.join(response.xpath(
            "//*[contains(text(),'How you will be assessed')]//following-sibling::p|//*[@id='accordion-1']/div[6]"
        ).extract()))

        item['assessment_en'] = assessment_en
        item['alevel'] = alevel
        item['ib'] = ib
        item['ucascode'] = ucascode
        item['apply_proces_en'] = apply_proces_en
        item['apply_fee'] = apply_fee
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['location'] = location
        item['programme_en'] = programme_en
        item['degree_name'] = degree_name
        item['degree_type'] = degree_type
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['start_date'] = start_date
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['url'] = url
        item['application_open_date'] = application_open_date
        yield item

示例#13

显示文件

文件： TheUniversityofAdelaide_p.py 项目： histudent/python_spider

    def parse(self, response):
        """Scrape one University of Adelaide coursework-degree page into an item.

        Fields are located in ``response`` with XPath, cleaned with the helper
        functions already used by this file (``remove_tags``, ``remove_class``,
        ``clear_space_str``, ``getTuition_fee``) and stored on the item
        returned by ``get_item1(ScrapyschoolEnglandItem1)``.

        Args:
            response: the downloaded course page; must provide ``xpath()``
                and ``url``.

        Yields:
            The populated item.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        # 1. university
        university = 'The University of Adelaide'

        # 2. url
        url = response.url

        # 3. programme_en: the page title without the "Master of " prefix; if
        # the title carries a bracketed part, only the bracketed text is kept.
        title = ''.join(
            response.xpath('//*[@id="ua-main-content"]/h2/text()').extract())
        programme_en = remove_tags(title).replace('Master of ', '')
        # re.search leaves the title untouched when a '(' has no matching ')'
        # (indexing the findall() result used to raise IndexError there).
        bracketed = re.search(r'\((.*)\)', programme_en)
        if bracketed:
            programme_en = bracketed.group(1)

        # 4. degree_type
        degree_type = 2

        # 5. degree_name: the title up to the first '(' (whole title if none)
        degree_name = title.split('(')[0].strip()

        # 6. teach_time
        teach_time = 'coursework'

        # 7. duration / 8. duration_per
        duration_text = clear_space_str(remove_tags(''.join(response.xpath(
            '//*[@id="ua-main-content"]/div[2]/div[3]/span[2]').extract())))
        if '1.5' in duration_text:
            duration = 1.5
        else:
            digits = re.findall(r'\d', duration_text)
            duration = digits[0] if digits else None
        duration_per = 1

        # 9. location
        location = remove_tags(''.join(response.xpath(
            '//*[@id="ua-main-content"]/div[2]/div[1]/span[2]/a').extract()))
        if '2019/hd' in response.url:
            location = 'North Terrace Campus'

        # 10. overview_en
        overview_en = remove_class(''.join(response.xpath(
            '//*[@id="ua-main-content"]/div[2]/div/div').extract()))

        # 11-15. ielts: one extracted <td> per score, addressed by position.
        ielts_cells = response.xpath(
            '//*[@id="df-acc-admission"]/div[5]/table[2]//tr[2]/td/table//tr/td'
        ).extract()

        def band_score(index, marker, marker_value, pattern, default):
            """Read one IELTS score from ``ielts_cells[index]``.

            Returns ``default`` when the cell does not exist, ``marker_value``
            when ``marker`` occurs in the cell, otherwise the first match of
            ``pattern`` in the cell (``None`` if there is no match).
            """
            try:
                cell = ielts_cells[index]
            except IndexError:
                return default
            if marker in cell:
                return marker_value
            found = re.findall(pattern, cell)
            return found[0] if found else None

        ielts = band_score(1, '7', 7, r'\d\.\d', 7)
        ielts_r = band_score(2, '6.5', 6.5, r'\d', 6.5)
        ielts_l = band_score(3, '6.5', 6.5, r'\d', 6.5)
        ielts_s = band_score(4, '6.5', 6.5, r'\d', 6.5)
        ielts_w = band_score(5, '6.5', 6.5, r'\d', 6.5)

        # 16-20. toefl: the first five integers in the TOEFL row are read as
        # total, reading, listening, speaking, writing; with fewer than five
        # numbers the hard-coded fallback scores are used.
        toefl_text = remove_tags(''.join(response.xpath(
            '//*[@id="df-acc-admission"]/div[5]/table[2]//tr[3]/td/table//tr/td'
        ).extract()))
        toefl_scores = re.findall(r'\d+', toefl_text)
        if len(toefl_scores) >= 5:
            toefl, toefl_r, toefl_l, toefl_s, toefl_w = toefl_scores[:5]
        else:
            toefl, toefl_r, toefl_l, toefl_s, toefl_w = 94, 24, 24, 23, 27

        # 21. rntry_requirements_en
        rntry_requirements_en = remove_class(''.join(response.xpath(
            '//*[@id="df-acc-admission"]/div[5]/table[3]//tr/td').extract()))

        # 22. apply_proces_en
        apply_proces_en = 'https://international.adelaide.edu.au/admissions/how-to-apply'

        # 23. deadline: matched against the unmodified page title.
        # programme_en has already had "Master of " stripped, so testing it
        # for "Master of ..." could never select the special-case deadlines.
        if 'Master of Psychology' in title:
            deadline = '2018-10-21,2019-5-1'
        elif 'Master of Viticulture and Oenology' in title:
            deadline = '2018-12-1,2019-4-30'
        else:
            deadline = '2018-12-1,2019-5-1'

        # 24. tuition_fee
        tuition_fee = getTuition_fee(''.join(response.xpath(
            '//*[@id="df-acc-fees_scholarships"]/div[5]/table//tr/td[2]'
        ).extract()))

        # 25. tuition_fee_pre
        tuition_fee_pre = '$'

        # 26. apply_pre
        apply_pre = '$'

        # 27. career_en
        career_en = remove_class(''.join(response.xpath(
            '//*[@id="df-acc-careers_parent"]//following-sibling::*'
        ).extract()))

        # 28. modules_en
        modules_en = remove_class(''.join(response.xpath(
            "//h4[contains(text(),'Example Study Plan')]/following-sibling::div[1]"
        ).extract()))

        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['teach_time'] = teach_time
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['location'] = location
        item['overview_en'] = overview_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_s'] = toefl_s
        item['toefl_w'] = toefl_w
        item['toefl_l'] = toefl_l
        item['rntry_requirements_en'] = rntry_requirements_en
        item['apply_proces_en'] = apply_proces_en
        item['deadline'] = deadline
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['career_en'] = career_en
        item['apply_pre'] = apply_pre
        item['modules_en'] = modules_en
        yield item

示例#14

显示文件

文件： UniversityOfCambridge_P.py 项目： histudent/python_spider

    def parse_rntry_requirements(self, entry_requirements_url):
        """Fetch a course's entry-requirements page and extract its contents.

        Args:
            entry_requirements_url: URL of the requirements page; it is
                requested with ``self.headers``.

        Returns:
            dict: keys ``IELTS``/``TOEFL`` plus their ``_L``, ``_S``, ``_R``
            and ``_W`` variants hold the joined text of the table cell that
            follows the matching header ('' when nothing matches), and
            ``entry`` holds the cleaned HTML of the requirements block ('' when
            the block is missing).
        """
        data = requests.get(entry_requirements_url, headers=self.headers)
        response = etree.HTML(data.text)

        requirements_div = (
            "div[@class='field field-name-field-gao-course-requirements "
            "field-type-text-long field-label-hidden']")

        # Serialise the requirements element back to HTML. Pages without the
        # block yield an empty string rather than an IndexError.
        entry = ""
        entry_nodes = response.xpath("//" + requirements_div)
        if entry_nodes:
            entry = etree.tostring(entry_nodes[0],
                                   encoding='unicode',
                                   pretty_print=False,
                                   method='html')
            entry = remove_class(clear_space_str(entry))

        # Path to the score table inside the requirements block.
        score_table = ("//div[@class='content campl-content-container']/"
                       + requirements_div
                       + "/div[@class='field-items']"
                       "/div[@class='field-item even']"
                       "/div[1]/div[1]/table[1]/tbody[1]")
        score_rows = (
            ('', 'Total'),
            ('_L', 'Listening'),
            ('_S', 'Speaking'),
            ('_R', 'Reading'),
            ('_W', 'Writing'),
        )

        # Collect the IELTS and TOEFL scores.
        # NOTE(review): both exams are read from the same table[1], so the
        # TOEFL_* values always equal the IELTS_* values. This looks like a
        # copy-paste slip, but the location of the TOEFL table cannot be
        # determined from this code; confirm against the live page.
        english_dict = {}
        for exam in ('IELTS', 'TOEFL'):
            for suffix, header in score_rows:
                cell_text = response.xpath(
                    score_table + "//th[contains(text(),'" + header
                    + "')]/following-sibling::*[1]//text()")
                english_dict[exam + suffix] = ''.join(cell_text)

        english_dict['entry'] = entry
        return english_dict

示例#15

显示文件

文件： RoyalHollowayUniversityofLondon_p.py 项目： histudent/python_spider

    def parse(self, response):
        """Scrape one Royal Holloway postgraduate course page into an item.

        Fields are located in ``response`` with XPath, cleaned with the helper
        functions already used by this file (``remove_tags``, ``remove_class``,
        ``clear_space_str``) and stored on the item returned by
        ``get_item1(ScrapyschoolEnglandItem1)``. The IELTS scores are not read
        from the page: they come from a keyword table matched against the
        programme title.

        Args:
            response: the downloaded course page; must provide ``xpath()``
                and ``url``.

        Yields:
            The populated item.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        # 1. university
        university = 'Royal Holloway University of London'

        # 2. department
        try:
            department = remove_tags(''.join(response.xpath(
                '//*[@id="main"]/aside/div[2]/a/div[2]/span[2]').extract()))
        except Exception:
            department = 'N/A'

        # 3. location
        location = 'London'

        # 4. degree_type
        degree_type = 2

        # 5. degree_name
        try:
            degree_name = remove_tags(''.join(response.xpath(
                '/html/body/div[1]/main/div[1]/div/div/div/span').extract()))
        except Exception:
            degree_name = 'N/A'

        # 6. programme_en
        try:
            programme_en = clear_space_str(remove_tags(''.join(response.xpath(
                '/html/body/div[1]/main/div[1]/div/div/div/h2').extract())))
        except Exception:
            programme_en = ''

        # 7. overview_en (markup is kept; only whitespace is cleaned)
        try:
            overview_en = clear_space_str(''.join(
                response.xpath('//*[@id="main"]/article/p[1]').extract()))
        except Exception:
            overview_en = ''

        # 8. duration: first digit in the extracted <span> markup
        try:
            duration_html = ''.join(response.xpath(
                '/html/body/div[1]/main/div[2]/div/ul/li[1]/span').extract())
            duration = re.findall(r'\d', duration_html)[0]
        except Exception:
            duration = ''

        # 9. duration_per
        duration_per = 1

        # 10. modules_en
        try:
            modules_en = clear_space_str(remove_class(''.join(
                response.xpath('//*[@id="accordionItem1"]/div').extract())))
        except Exception:
            modules_en = ''

        # 11. assessment_en
        try:
            assessment_en = clear_space_str(remove_class(''.join(
                response.xpath('//*[@id="accordionItem2"]/div').extract())))
        except Exception:
            assessment_en = ''

        # 12. career_en
        try:
            career_en = clear_space_str(remove_class(''.join(
                response.xpath('//*[@id="accordionItem4"]/div').extract())))
        except Exception:
            career_en = ''

        # 13. tuition_fee: first amount following a pound sign. Thousands
        # separators are accepted and removed, so "£16,500" gives "16500"
        # (matching only \d+ stopped at the comma and gave "16").
        try:
            fee_text = remove_tags(''.join(response.xpath(
                '//*[@id="accordionItem5"]/div/p[2]').extract()))
            tuition_fee = re.findall(r'£(\d[\d,]*)', fee_text)[0].replace(',', '')
        except Exception:
            tuition_fee = 'N/A'

        # 14. tuition_fee_pre
        tuition_fee_pre = '£'

        # 15. rntry_requirements
        rntry_requirements = clear_space_str(remove_tags(''.join(
            response.xpath('//*[@id="accordionItem3"]/div').extract())))

        # 16-20. ielts: the first keyword found in the programme title wins,
        # so the order of this table is significant (e.g. 'MBA' is tested
        # before 'Management'). Columns: keyword, overall score, writing
        # score, score used for reading/listening/speaking.
        # NOTE(review): the match is case-sensitive and 'theatre' is the only
        # lower-case keyword - confirm it matches the titles on the site.
        ielts_by_keyword = (
            ('Classics', 6.5, 7, 5.5),
            ('theatre', 6.5, 7, 5.5),
            ('English', 7, 7, 5.5),
            ('European Studies', 6.5, 6.5, 5.5),
            ('History', 6.5, 7, 5.5),
            ('Media Arts', 6.5, 6.5, 5.5),
            ('Music', 6.5, 7, 5.5),
            ('Economics', 6.5, 6, 6),
            ('MBA', 7, 6, 5.5),
            ('Management', 6.5, 6, 6),
            ('Biological Sciences', 6.5, 7, 5.5),
            ('Electronic Engineering', 6.5, 5.5, 5.5),
            ('Physics', 6.5, 5.5, 5.5),
            ('Psychology', 6.5, 5.5, 5.5),
        )
        # Scores used when no keyword matches.
        ielts, ielts_w, ielts_other = 6.5, 5.5, 5.5
        for keyword, overall, writing, other_skills in ielts_by_keyword:
            if keyword in programme_en:
                ielts, ielts_w, ielts_other = overall, writing, other_skills
                break
        ielts_r = ielts_l = ielts_s = ielts_other

        # 21. require_chinese_en
        require_chinese_en = ''

        # 22. url
        url = response.url

        # 23. other
        other = 'https://intranet.royalholloway.ac.uk/international/documents/pdf/internationalstudentsupport/tier-4-checklist-outside-uk.pdf'

        # 24. apply_proces_en
        apply_proces_en = 'https://admissions.royalholloway.ac.uk/AP/Login.aspx'

        # 25. teach_time
        teach_time = 'Full-time'

        item['teach_time'] = teach_time
        item['other'] = other
        item['apply_proces_en'] = apply_proces_en
        item['university'] = university
        item['department'] = department
        item['location'] = location
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['programme_en'] = programme_en
        item['overview_en'] = overview_en
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['rntry_requirements'] = rntry_requirements
        item['ielts'] = ielts
        item['ielts_w'] = ielts_w
        item['ielts_r'] = ielts_r
        item['ielts_l'] = ielts_l
        item['ielts_s'] = ielts_s
        item['require_chinese_en'] = require_chinese_en
        item['url'] = url
        yield item

示例#16

显示文件

文件： UniversityofCumbria_p.py 项目： histudent/python_spider

    def parse(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        #1.university
        university = 'University of Cumbria'
        # print(university)

        #2.url
        url = response.url
        # print(url)

        #3.programme_en
        programme_en = response.xpath(
            '/html/body/main/div[1]/header/div/h1/text()').extract()
        programme_en = ''.join(programme_en)
        programme_en = remove_tags(programme_en)
        programme_en = clear_space_str(programme_en)
        # print(programme_en)

        #4.degree_type
        degree_type = 2

        #5.degree_name
        degree_name = response.xpath(
            '/html/body/main/div[1]/header/div/h1/em').extract()
        degree_name = ''.join(degree_name)
        degree_name = remove_tags(degree_name)
        # print(degree_name)

        #6.location
        location = response.xpath(
            "//*[contains(text(),'Location')]//following-sibling::*").extract(
            )
        location = ''.join(location)
        location = remove_tags(location)
        # print(location)

        #7.duration #8.duration_per #9.teach_time
        duration_list = response.xpath(
            "//*[contains(text(),'Duration')]//following-sibling::*").extract(
            )
        duration_list = ''.join(duration_list)
        duration_list = remove_tags(duration_list)
        try:
            duration = re.findall('\d+', duration_list)[0]
        except:
            duration = 1
        if int(duration) > 5:
            duration_per = 3
        else:
            duration_per = 1
        if 'Full' in duration_list:
            teach_time = 'Full-time'
        else:
            teach_time = 'Part-time'
        # print(duration,teach_time,duration_per)

        #10.start_date
        start_date = response.xpath(
            "//*[contains(text(),'Start date')]//following-sibling::*"
        ).extract()
        start_date = ''.join(start_date)
        start_date = remove_tags(start_date)
        if 'Various' in start_date:
            start_date = '2018-*'
        elif 'September, November 2018' in start_date:
            start_date = '2018-9,2018-11'
        elif 'September 2018, March 2019' in start_date:
            start_date = '2018-9,2019-3'
        elif 'September 2018; January 2018' in start_date:
            start_date = '2018-9,2019-1'
        elif 'April, September 2018' in start_date:
            start_date = '2018-9,2019-4'
        elif 'January, May, October 2018' in start_date:
            start_date = '2018-10,2019-1,2019-5'
        elif 'January, April, July, October 2018' in start_date:
            start_date = '2018-7,2019-1,2019-4,2019-7'
        elif 'September 2018; March 2018' in start_date:
            start_date = '2018-9,2019-3'
        elif 'June, September 2018' in start_date:
            start_date = '2018-6,2018-9'
        elif 'January, April or September 2018' in start_date:
            start_date = '2018-9,2019-1,2019-4'
        elif 'January, April, September 2018' in start_date:
            start_date = '2018-9,2019-1,2019-4'
        elif 'January 2018; September 2018' in start_date:
            start_date = '2018-9'
        elif 'October 2018; May, January 2018' in start_date:
            start_date = '2018-10,2019-1,2019-5'
        elif 'September 2018; January 2018 ' in start_date:
            start_date = '2018-9,2019-1'
        elif 'January, May, September 2018, 2019' in start_date:
            start_date = '2018-9,2019-1,2019-5'
        else:
            start_date = translate_month(start_date)
            start_date = '2018-' + str(start_date)
        # print(start_date)

        #11.modules_en
        modules_en = response.xpath(
            "//h3[contains(text(),'Modules')]//following-sibling::*[1]"
        ).extract()
        modules_en = ''.join(modules_en)
        modules_en = remove_class(modules_en).strip()
        # print(modules_en)

        #12.rntry_requirements
        rntry_requirements = response.xpath(
            "//h3[contains(text(),'Selection criteria')]//following-sibling::*"
        ).extract()
        rntry_requirements = ''.join(rntry_requirements)
        rntry_requirements = remove_class(rntry_requirements)
        # print(rntry_requirements)

        #13.tuition_fee_pre
        tuition_fee_pre = '£'

        #14.other
        other = 'https://www.cumbria.ac.uk/media/university-of-cumbria-website/content-assets/public/finance/documents/studentfinance/fees/postgraduate-taught-tuition-fees-2018-19.pdf'

        #15.ielts 16171819
        if 'Occupational Therapy' in programme_en:
            ielts = 7.0
            ielts_r = 6.5
            ielts_s = 6.5
            ielts_l = 6.5
            ielts_w = 6.5
        elif 'Physiotherapy' in programme_en:
            ielts = 7.0
            ielts_r = 6.5
            ielts_s = 6.5
            ielts_l = 6.5
            ielts_w = 6.5
        elif 'Social Work' in programme_en:
            ielts = 7.0
            ielts_r = 6.5
            ielts_s = 6.5
            ielts_l = 6.5
            ielts_w = 6.5
        else:
            ielts = 6.5
            ielts_r = 5.5
            ielts_s = 5.5
            ielts_l = 5.5
            ielts_w = 5.5
        #20.require_chinese_en
        require_chinese_en = '<p>Bachelor’s degree or equivalent.English Language: IELTS 6.5 with at least 6.0 in each section (or equivalent).</p>'
        #21.apply_pre
        apply_pre = '£'

        item['apply_pre'] = apply_pre
        item['require_chinese_en'] = require_chinese_en
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['location'] = location
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['teach_time'] = teach_time
        item['start_date'] = start_date
        item['modules_en'] = modules_en
        item['tuition_fee_pre'] = tuition_fee_pre
        item['rntry_requirements'] = rntry_requirements
        item['other'] = other
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        yield item

示例#17

显示文件

文件： UniversityofLeeds_p.py 项目： histudent/python_spider

    def parse(self, response):
        """Extract one University of Leeds course page into an item.

        Reads the course fields out of ``response`` with XPath queries,
        normalises them with ``remove_tags`` / ``remove_class`` /
        ``clear_space_str`` and yields a single populated item.

        Args:
            response: Page response exposing ``url`` and ``xpath()``.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with
            the scraped fields filled in.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        # 1. university
        university = 'University of Leeds'

        # 2. url
        url = response.url

        # 3. programme_en
        programme_en = remove_tags(''.join(
            response.xpath('//*[@id="main"]/div/header/h1').extract()))

        # 4. degree_type
        degree_type = 2

        # 5. degree_name -- the last word of the page title.  Only that
        # trailing word is cut off the programme name (a plain
        # str.replace would also delete the same token elsewhere in the
        # title), and an empty title no longer raises IndexError.
        title_words = programme_en.split()
        degree_name = title_words[-1] if title_words else ''
        programme_en = programme_en.strip()
        if degree_name:
            programme_en = programme_en[:-len(degree_name)].strip()

        # 6. overview_en
        overview_en = ''.join(response.xpath('//*[@id="acc1"]/p').extract())
        overview_en = clear_space_str(remove_class(overview_en))

        # 7. modules_en
        modules_en = ''.join(response.xpath(
            "//*[contains(text(),'Modules')]//following-sibling::div"
        ).extract())
        modules_en = clear_space_str(remove_class(modules_en))

        # 8. assessment_en
        assessment_en = ''.join(response.xpath(
            "//*[contains(text(),'Assessment')]//following-sibling::*"
        ).extract())
        assessment_en = remove_class(clear_space_str(assessment_en))

        # 9. start_date -- the first matching keyword wins; anything
        # unrecognised falls back to September 2018.
        start_text = ''.join(response.xpath(
            '//*[@id="keyfacts-acc"]/ul/li[1]/span[2]').extract())
        start_text = remove_tags(clear_space_str(start_text))
        start_date_rules = (
            ('September', '2018-9'),
            ('October', '2018-10'),
            ('January', '2019-1'),
            ('6 August 2018', '2018-8-6'),
            ('9 July 2018', '2018-7-9'),
        )
        start_date = '2018-9'
        for keyword, mapped in start_date_rules:
            if keyword in start_text:
                start_date = mapped
                break

        # 10. duration / duration_per
        duration_list = ''.join(response.xpath(
            "//*[contains(text(),'Duration/Mode')]//following-sibling::*"
        ).extract())
        duration_list = clear_space_str(remove_tags(duration_list))
        numbers = re.findall(r'\d+', duration_list)
        duration_a = numbers[0] if numbers else 'N/A'
        if '6 weeks' in duration_list:
            duration = 6
            duration_per = 4
        elif '10 weeks' in duration_list:
            duration = 10
            duration_per = 4
        elif not numbers:
            # No number on the page: keep the 'N/A' placeholder instead of
            # crashing on int('N/A'); the unit is unknown, so it is None.
            duration = duration_a
            duration_per = None
        elif int(duration_a) > 3:
            duration = duration_a
            duration_per = 3
        else:
            duration = duration_a
            duration_per = 1

        # 11. teach_time
        if 'full time' in duration_list:
            teach_time = 'full time'
        else:
            teach_time = 'part time'

        # 12. ielts (overall, writing, reading, speaking, listening) --
        # assigned positionally from the decimal numbers found in the
        # language requirements section.
        ielts_text = ''.join(response.xpath(
            "//*[contains(text(),'Language requirements')]//following-sibling::*"
        ).extract())
        scores = re.findall(r'\d\.\d', ielts_text)
        if len(scores) == 1:
            ielts = ielts_w = ielts_r = ielts_s = ielts_l = scores[0]
        elif len(scores) == 2:
            ielts = scores[0]
            ielts_w = ielts_r = ielts_s = ielts_l = scores[1]
        elif len(scores) == 3:
            ielts = scores[0]
            ielts_w = scores[1]
            ielts_r = ielts_s = ielts_l = scores[2]
        else:
            ielts = ielts_w = ielts_r = ielts_s = ielts_l = None

        # 17. department
        department = ''.join(response.xpath(
            "//*[contains(text(),'This course is taught by')]/../following-sibling::*"
        ).extract())
        department = clear_space_str(remove_tags(department))

        # 18. rntry_requirements
        rntry_requirements = ''.join(response.xpath(
            "//*[contains(text(),'Entry requirements:')]//following-sibling::*[1]"
        ).extract())
        rntry_requirements = remove_class(clear_space_str(rntry_requirements))
        # str.replace is a no-op when the phrase is absent.
        rntry_requirements = rntry_requirements.replace(
            'Full entry requirements', '')

        # 19. tuition_fee -- first "12,345"-style number, commas removed.
        fee_text = ''.join(response.xpath(
            "//*[contains(text(),'International fees')]//following-sibling::*[1]"
        ).extract())
        fee_text = remove_tags(clear_space_str(fee_text))
        fee_match = re.search(r'\d+,\d+', fee_text)
        if fee_match:
            tuition_fee = fee_match.group().replace(',', '')
        else:
            tuition_fee = None

        # 20. tuition_fee_pre
        tuition_fee_pre = '£'

        # 21. deadline -- mapped to the last day of the first month found.
        deadline_text = ''.join(response.xpath(
            "//*[contains(text(),'Application deadlines')]//following-sibling::*[1]|//*[contains(text(),'Application deadlines:')]/../following-sibling::*[1]"
        ).extract())
        deadline_text = remove_tags(deadline_text)
        deadline_rules = (
            ('August', '2018-8-31'),
            ('July', '2018-7-31'),
            ('June', '2018-6-30'),
            ('April', '2018-4-30'),
        )
        deadline = 'N/a'
        for month, mapped in deadline_rules:
            if month in deadline_text:
                deadline = mapped
                break

        # 22. career_en
        career_en = ''.join(response.xpath(
            "//h2[contains(text(),'Career opportunities')]/../following-sibling::*"
        ).extract())
        career_en = clear_space_str(remove_class(career_en))

        # 23. apply_proces_en
        apply_proces_en = 'https://application.leeds.ac.uk/login/?returnurl=%2f'

        # 24. toefl (overall, listening, reading, speaking, writing)
        toefl = 92
        toefl_l = 21
        toefl_r = 21
        toefl_s = 23
        toefl_w = 22

        # 29. apply_pre
        apply_pre = '£'

        # 30. apply_documents_en
        apply_documents_en = '<p>Make sure you have all your supporting documents scanned and ready to upload with your online application. All documents should be in English, or sent with certified translations into English. Without copies of the required documents we will be unable to make you an offer.</p>'

        item['apply_pre'] = apply_pre
        item['apply_documents_en'] = apply_documents_en
        item['toefl'] = toefl
        item['toefl_l'] = toefl_l
        item['toefl_r'] = toefl_r
        item['toefl_s'] = toefl_s
        item['toefl_w'] = toefl_w
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['start_date'] = start_date
        item['duration'] = duration
        item['teach_time'] = teach_time
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['ielts_s'] = ielts_s
        item['department'] = department
        item['rntry_requirements'] = rntry_requirements
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['deadline'] = deadline
        item['career_en'] = career_en
        item['apply_proces_en'] = apply_proces_en
        item['duration_per'] = duration_per
        yield item

示例#18

显示文件

文件： UniversityofEssex_p.py 项目： histudent/python_spider

    def parse(self, response):
        """Extract one University of Essex course page into an item.

        Reads the course fields out of ``response`` with XPath queries,
        normalises them with ``remove_tags`` / ``remove_class`` /
        ``clear_space_str``, fetches the entry requirements for
        ``countrykey=631`` with a blocking ``requests.get`` call, and yields
        a single populated item.

        Args:
            response: Page response exposing ``url`` and ``xpath()``.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with
            the scraped fields filled in.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        # 1. university
        university = 'University of Essex'

        # 2. url
        url = response.url

        # 3. programme_en
        programme_en = remove_tags(''.join(
            response.xpath('//*[@id="content"]//h1').extract()))

        # 4. degree_type
        degree_type = 2

        # 5. degree_name -- the first word of the page title.  Only that
        # leading word is cut off the programme name (a plain str.replace
        # would also delete the same token elsewhere in the title), and an
        # empty title no longer raises IndexError.
        title_words = programme_en.split()
        degree_name = title_words[0] if title_words else ''
        programme_en = programme_en.strip()[len(degree_name):].strip()

        # 6. start_date
        start_text = ''.join(response.xpath(
            "//*[contains(text(),'Start date')]//following-sibling::select"
        ).extract())
        start_text = clear_space_str(remove_tags(start_text))
        if 'Oct 2018/19' in start_text:
            start_date = '2018-10,2019-10'
        else:
            start_date = '2018-9,2019-9'

        # 7. teach_time
        mode_text = remove_tags(''.join(response.xpath(
            "//*[contains(text(),'Study mode')]//following-sibling::select"
        ).extract()))
        if 'Full Time' in mode_text:
            teach_time = 'Full Time'
        else:
            teach_time = 'Part Time'

        # 8. duration / 9. duration_per -- driven by the first digit found.
        duration_list = remove_tags(''.join(response.xpath(
            "//*[contains(text(),'Duration')]//following-sibling::*"
        ).extract()))
        digits = re.findall(r'\d', duration_list)
        if duration_list == '1 years 8 months':
            duration = '20'
            duration_per = 3
        elif not digits:
            # No digit on the page: leave both fields empty instead of
            # raising IndexError and losing the whole item.
            duration = None
            duration_per = None
        elif int(digits[0]) < 5:
            duration = digits[0]
            duration_per = 1
        else:
            duration = digits[0]
            duration_per = 3

        # 10. location
        location = remove_tags(''.join(response.xpath(
            "//*[contains(text(),'Location')]//following-sibling::span"
        ).extract()))

        # 11. department -- an over-long match is treated as a bad match.
        department_a = remove_tags(''.join(response.xpath(
            "//*[contains(text(),'Based in')]//following-sibling::*"
        ).extract()))
        if len(department_a) > 500:
            department = 'N/A'
        else:
            department = department_a

        # 12. overview_en
        overview_en = ''.join(response.xpath('//*[@id="overview"]//p').extract())
        overview_en = clear_space_str(remove_class(overview_en))

        # 13. ielts (overall, writing, reading, listening, speaking).
        # '2.1' and '2.2' are degree classifications, not IELTS scores, so
        # every occurrence is dropped before the positional assignment
        # (removing only the first one let a repeat become the score).
        ielts_text = ''.join(response.xpath(
            '//*[@id="entry-requirements"]//text()').extract())
        scores = [
            score for score in re.findall(r'\d\.\d', ielts_text)
            if score not in ('2.1', '2.2')
        ]
        if len(scores) in (2, 4):
            ielts = scores[0]
            ielts_s = ielts_w = ielts_l = ielts_r = scores[1]
        elif len(scores) == 3:
            ielts = scores[0]
            ielts_w = scores[1]
            ielts_r = ielts_l = ielts_s = scores[2]
        else:
            ielts = 6.0
            ielts_w = 5.5
            ielts_r = 5.5
            ielts_l = 5.5
            ielts_s = 5.5

        # 18. modules_en
        modules_en = ''.join(response.xpath(
            "//div[@class='tabs__panels content-padding']").extract())
        modules_en = clear_space_str(remove_class(modules_en))

        # 19. tuition_fee -- 'TBC' or an over-long match means no fee.
        tuition_fee = remove_tags(''.join(response.xpath(
            "//*[contains(text(),'International fee')]//following-sibling::*"
        ).extract()))
        tuition_fee = tuition_fee.replace(',', '').replace('£', '')
        if tuition_fee == 'TBC' or len(tuition_fee) >= 200:
            tuition_fee = None

        # 20. tuition_fee_pre
        tuition_fee_pre = '£'

        # 21. apply_proces_en
        apply_proces_en = 'https://www1.essex.ac.uk/pgapply/login.aspx'

        # 22. rntry_requirements
        rntry_requirements = remove_class(''.join(response.xpath(
            '//*[@id="entry-requirements"]/div//p[1]').extract()))

        # 23. require_chinese_en -- the course id and subgroup code are the
        # first two numbers in the URL part after 'courses/pg'.  If the URL
        # does not have that shape the field is left as an empty paragraph
        # instead of raising IndexError.
        course_path = re.findall(r'courses/pg(.*)/', url)
        course_ids = re.findall(r'\d+', course_path[0]) if course_path else []
        if len(course_ids) >= 2:
            chi_url2 = (
                'https://www.essex.ac.uk/api/sitecore/coursePage/EntryRequirementInternational?mastercourseid=PG'
                + str(course_ids[0]) + '&subgroupcode=' + str(course_ids[1])
                + '&courseyear=18&countrykey=631')
            # NOTE(review): blocking call with no timeout inside the
            # callback -- consider a timeout or a follow-up request.
            data = requests.get(chi_url2)
            data_list = etree.HTML(data.text)
            require_chinese_en = ''.join(
                data_list.xpath('/html/body/div/p/text()'))
        else:
            require_chinese_en = ''
        require_chinese_en = '<p>' + require_chinese_en + '</p>'

        # 24. apply_documents_en -- one literal per original text line,
        # joined with explicit newlines (a plain "..." literal cannot span
        # physical lines); the closing tag is '</p>', not '<\p>'.
        apply_documents_en = (
            "<p>Necessary documents When you apply to study with us, you'll need to provide a number of supporting documents - we can't process your application until we have these. Some of these documents you will have to upload with your application, others you may be able to upload at a later date. We may ask to see original documents if you are offered a place. English language If you have received your test results you may include a copy with your application. The main tests we accept are IELTS, TOEFL or Pearson, and the test must be less than two years old at the time of admission. The IELTS requirement for your course is listed on our Postgraduate Research Finder. You can also see more detailed information about English language requirements here (.pdf) Transcripts Official transcript(s), in English or a certified translation of your academic results to date, showing marks or grades, must be provided at the time you make your application. (Transcripts are not required from current or previous University of Essex students, or from students who have previously completed a degree at Colchester Institute awarded by the University of Essex). CV A CV is required for some research degrees at the time of application. Research proposal Requirements vary across departments but two references and a research proposal are required for all research degrees.  A research proposal is required at application stage for most research degrees. Think about your research idea - during your PhD you will conduct and present the results of your original investigations and research. You need to ensure that your research topic will be interesting enough for three or four years. Start to research your topic by reading around your subject area and begin to think what you might like to include in your research proposal. \n"
            "Get in touch with a suitable department by contacting the Graduate Director - you might still be developing your idea at this stage, but it would be great if you could send a short description of your research area and a copy of your CV. This does not need to be longer than one A4 page. You can search for a department or supervisor through our Postgraduate Research Finder. Writing your proposalYour research proposal is an important part of your application for a research degree. Use it to explain your personal and academic goals in undertaking an extended period of research, and reﬂect on the contribution you will make to the development of new knowledge, ideas and solutions. Also comment on how your research interests fit with the academic focus and expertise at Essex  Your research proposal needs to demonstrate that you have, or are able to develop, the competencies and skills needed to complete your project, within the time and resources available. The quality of your writing is important and a good research proposal may be rejected if it is poorly expressed or badly presented. Many of our departments, schools and centres offer more detailed guidance on preparing a research proposal on their web pages. If you are applying for funding, ensure your proposal fulfils the requirements of your preferred funding body. \n"
            "Your research proposal should include: a working title and key words a summary of the aims and objectives of your research an outline of the ways you meet these aims and objectives, referring to research methods and specific resources you use evidence of your awareness of relevant literature and theoretical approaches an overview of the expected outcomes and the original contribution your research will make to existing bodies of knowledge a brief statement on how your research interests tie in with those found in the department, school or centrePersonal statement If you are applying for a taught course and you need a Tier 4 student visa to study in the UK, then a personal statement (no more than 500 words) is required at the time you make your application, and this should refer specifically to your reasons for wishing to study in the UK, and why you have chosen your area of study. Please remember to include details of any relevant work experience, why you think your academic strengths are suited to your area of study, and how this study will assist you to realise your career objectives. References We require two references from you at the application stage.References should be recent and verifiable, on official institution paper, signed and dated by the referee. If a referee wishes to provide an email reference, it must be sent from an official email account (for example, not Yahoo, Gmail or Hotmail).</p>"
        )

        # 25. career_en
        career_en = remove_class(''.join(response.xpath(
            "//*[contains(text(),'Your future')]//following-sibling::*"
        ).extract()))

        # 26. assessment_en -- teaching, assessment and dissertation
        # sections concatenated, capped at 30000 characters.
        assessment_en_1 = remove_class(''.join(response.xpath(
            "//*[contains(text(),'Teaching')]//following-sibling::*"
        ).extract()))
        assessment_en_2 = remove_class(''.join(response.xpath(
            "//*[contains(text(),'Assessment')]//following-sibling::*"
        ).extract()))
        assessment_en_3 = remove_class(''.join(response.xpath(
            "//*[contains(text(),'Dissertation')]//following-sibling::*"
        ).extract()))
        assessment_en = (
            assessment_en_1 + assessment_en_2 + assessment_en_3)[:30000]

        item['apply_documents_en'] = apply_documents_en
        item['require_chinese_en'] = require_chinese_en
        item['rntry_requirements'] = rntry_requirements
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['start_date'] = start_date
        item['teach_time'] = teach_time
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['location'] = location
        item['department'] = department
        item['overview_en'] = overview_en
        item['ielts'] = ielts
        item['ielts_w'] = ielts_w
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['modules_en'] = modules_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['apply_proces_en'] = apply_proces_en
        item['career_en'] = career_en
        item['assessment_en'] = assessment_en
        yield item

示例#19

显示文件

文件： UniversityofGreenwich_u.py 项目： histudent/python_spider

    def parse(self, response):
        pass
        item = get_item1(ScrapyschoolEnglandItem1)

        #1.university
        university = 'University of Greenwich'
        # print(university)

        #2.url
        url = response.url
        # print(url)

        #3.programme_en
        programme_en = response.xpath(
            '//*[@id="default"]/header/div/h1').extract()
        programme_en = ''.join(programme_en)
        programme_en = remove_tags(programme_en)
        programme_en = clear_space_str(programme_en)
        programme_en = programme_en.split(',')
        programme_en = ''.join(programme_en[:-1])
        # print(programme_en)
        # print(programme_en)

        #4.degree_type
        degree_type = 1

        #5.degree_name
        try:
            degree_name = re.findall(r',(.*)', programme_en)[0].strip()
        except:
            degree_name = 'N/A'
        if ',' in degree_name:
            degree_name = re.findall(r',(.*)', degree_name)[0].strip()
        try:
            programme_en = programme_en.replace(degree_name,
                                                '').replace(',', '').strip()
        except:
            pass
        # print(programme_en)
        # print(degree_name)

        #6.department
        department = response.xpath(
            '//i[@aria-label="Department"]//following-sibling::*').extract()
        department = ''.join(department)
        department = remove_tags(department)
        if '&amp; ' in department:
            department = department.replace('&amp; ', '')
        # print(department)

        #7.location
        location = response.xpath(
            '//i[@aria-label="Location"]//following-sibling::*').extract()
        location = ''.join(location)
        location = remove_tags(location)
        # print(location)

        #8.ucascode
        ucascode = response.xpath(
            '//*[@id="faculty"]/div[2]/article/div/div/div[1]/div[2]/div[2]/div/h3'
        ).extract()
        ucascode = ''.join(ucascode)
        ucascode = remove_tags(ucascode)
        ucascode = ucascode[:4]
        # print(ucascode)

        #9.duration #10.duration_per
        duration = response.xpath(
            '//*[@aria-label="Duration"]//following-sibling::p[1]').extract()
        duration = ''.join(duration)
        duration = remove_tags(duration)
        duration = duration.replace('full time', '').strip()
        # print(duration)
        duration_per = 1

        #11.overview_en
        overview_en = response.xpath(
            "//div[contains(@class,'overview-text')]").extract()
        overview_en = ''.join(overview_en)
        overview_en = remove_class(overview_en)
        # print(overview_en)

        #12.modules_en
        modules_en_url = response.xpath(
            "//meta[@name='prog_no']//@content").extract()
        modules_en_url = ''.join(modules_en_url)
        if len(modules_en_url) != 0:
            modules_en_url = 'https://www.gre.ac.uk/ug/content/ajax/courses-ajax-call?prog=' + str(
                modules_en_url)
        else:
            modules_en_url = ''
        if len(modules_en_url) != 0:
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
            }
            data = requests.get(modules_en_url, headers=headers)
            response1 = etree.HTML(data.text)
            modules_en = response1.xpath("//div[@class='gre-page-copy']")
            doc = ""
            if len(modules_en) > 0:
                for a in modules_en:
                    doc += (etree.tostring(a,
                                           encoding='unicode',
                                           pretty_print=False,
                                           method='html'))
                    doc = remove_class(doc)
        else:
            modules_en = 'N/A'
            doc = ''
        print(modules_en)

        #13.apply_desc_en
        apply_desc_en = response.xpath(
            '//*[@id="entry-requirements"]/div').extract()
        apply_desc_en = ''.join(apply_desc_en)
        apply_desc_en = remove_class(apply_desc_en).strip()
        # print(apply_desc_en)

        #14.assessment_en
        try:
            assessment_en = response.xpath(
                "//h3[contains(text(),'Careers')]//preceding-sibling::*"
            ).extract()
            assessment_en = ''.join(assessment_en)
            assessment_en = remove_class(assessment_en)
        except:
            assessment_en = ''
        # print(assessment_en)

        #15.career_en
        career_en = response.xpath(
            "//h4[contains(text(),'Do you provide employability services?')]//preceding-sibling::*"
        ).extract()
        if len(career_en) == 0:
            career_en = response.xpath(
                "//h4[contains(text(),'areers')]/following-sibling::*"
            ).extract()
        career_en = ''.join(career_en)
        career_en = remove_class(career_en)
        # print(career_en)

        #16.tuition_fee
        if 'Adult Nursing' in programme_en:
            tuition_fee = 13950
        elif 'Business Logistics and Transport Management' in programme_en:
            tuition_fee = 13950
        elif 'Business Purchasing and Supply Chain Management' in programme_en:
            tuition_fee = 13950
        elif 'Business Studies' in programme_en:
            tuition_fee = 13950
        elif 'Business with Accounting' in programme_en:
            tuition_fee = 13950
        elif 'Business with Finance' in programme_en:
            tuition_fee = 13950
        elif 'Business with Human Resource Management' in programme_en:
            tuition_fee = 13950
        elif 'Business with Marketing' in programme_en:
            tuition_fee = 13950
        elif "Children's Nursing" in programme_en:
            tuition_fee = 13950
        elif 'Law' in programme_en:
            tuition_fee = 13950
        elif 'Nursing' in programme_en:
            tuition_fee = 13950
        elif 'Mental Health Work' in programme_en:
            tuition_fee = 13950
        elif 'Midwifery' in programme_en:
            tuition_fee = 13950
        elif 'Paramedic Science' in programme_en:
            tuition_fee = 13950
        elif 'Specialist Community Public Health' in programme_en:
            tuition_fee = 13950
        elif 'Study Abroad' in programme_en:
            tuition_fee = 13950
        else:
            tuition_fee = 12100

        #17.tuition_fee_pre
        tuition_fee_pre = '£'

        #18.apply_proces_en
        apply_proces_en = 'https://www.gre.ac.uk/study/apply/ug'

        #19.ielts 20212223
        ielts = 6.5
        ielts_r = 5.5
        ielts_w = 5.5
        ielts_s = 5.5
        ielts_l = 5.5

        #24.apply_pre
        apply_pre = '£'

        #25.alevel
        alevel = response.xpath(
            "//*[contains(text(),'UCAS points')]//following-sibling::*[1]"
        ).extract()
        if len(alevel) == 0:
            alevel = response.xpath(
                "//*[contains(text(),'points')]//text()").extract()
        alevel = ''.join(alevel)
        alevel = remove_tags(alevel)
        try:
            alevel = re.findall('(\d+)\W\(view', alevel)[0]
        except:
            alevel = ''
        if len(alevel) == 0:
            alevel = response.xpath(
                "//*[contains(text(),'points')]//text()").extract()
            alevel = ''.join(alevel)
            try:
                alevel = re.findall('(\d+)\WUCAS', alevel)[0]
            except:
                alevel = None
        alevel = alevel + ' UCAS points'
        # print(alevel)

        item["alevel"] = alevel
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['department'] = department
        item['location'] = location
        item['ucascode'] = ucascode
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['overview_en'] = overview_en
        # item['modules_en'] = doc
        item['apply_desc_en'] = apply_desc_en
        item['assessment_en'] = assessment_en
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['apply_proces_en'] = apply_proces_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        yield item

示例#20

显示文件

    def parse(self, response):
        """Scrape one Cardiff University course page into a single item.

        Every field is either a constant or the result of an XPath query on
        ``response`` passed through the project helpers ``remove_tags``,
        ``remove_class``, ``clear_space_str`` and ``getTuition_fee``. Those
        helpers are defined outside this method; presumably they strip
        markup / attributes / whitespace and parse a fee amount -- confirm
        against their definitions.

        Args:
            response: Page response providing ``url`` and
                ``xpath(query).extract()``.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)``,
            exactly once, after all fields have been assigned.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def joined(query):
            # Concatenate every fragment matched by the XPath query.
            return ''.join(response.xpath(query).extract())

        # 1. university
        university = 'Cardiff University'

        # 2. url
        url = response.url

        # 3. programme_en -- the page heading; when it contains a
        # parenthesised part, the last word is dropped from the name.
        heading = remove_tags(joined('//*[@id="content"]/div[1]/div/div[1]/h1'))
        heading_words = heading.split()
        if '(' in heading:
            programme_en = ' '.join(heading_words[:-1])
        else:
            programme_en = heading

        # 4. overview_en
        overview_en = clear_space_str(
            remove_class(joined('//*[@id="content"]/div[1]/div/div[1]/p')))

        # 5. start_date
        start_date = '2019-9'

        # 6. duration / 7. duration_per
        duration_list = remove_tags(
            joined("//*[contains(text(),'Duration')]//following-sibling::*"))
        if 'five years' in duration_list:
            duration = 5
        elif 'seven years' in duration_list:
            duration = 7
        else:
            # First number in the text (kept as a string), or 1 when the
            # text contains no digits at all.
            digits = re.search(r'\d+', duration_list)
            duration = digits.group() if digits else 1
        # NOTE(review): 3 looks like the project's code for "months" and 1
        # for "years" -- confirm against the item schema.
        duration_per = 3 if 'months' in duration_list else 1

        # 8. degree_name -- last word of the heading without parentheses.
        # An empty heading yields '' instead of raising IndexError.
        degree_name = heading_words[-1] if heading_words else ''
        degree_name = degree_name.replace('(', '').replace(')', '').strip()

        # 9. ucascode
        ucascode = remove_tags(joined('//*[@id="section1"]/table[1]//tr[1]/td'))

        # 10. modules_en
        modules_en = remove_class(joined('//*[@id="section2"]'))

        # 11. alevel
        alevel = remove_tags(
            joined("//*[contains(text(),'A level ')]//following-sibling::*")
        ).strip()

        # 12. assessment_en
        assessment_en = remove_class(
            joined("//*[contains(text(),'How will I be assessed?')]//following-sibling::p"))

        # 13. career_en
        career_en = remove_class(joined('//*[@id="section5"]/div[1]/div[1]/p'))

        # 14. tuition_fee
        tuition_fee = getTuition_fee(
            remove_tags(joined('//*[@id="tuitionfees"]/table/tbody/tr/td[1]')))

        # 15. ib
        ib = remove_tags(
            joined("//*[contains(text(),'International Baccalaureate')]//following-sibling::*")
        ).strip()

        # 16. tuition_fee_pre
        tuition_fee_pre = '£'

        # 17-21. IELTS overall and per-skill scores, chosen by keywords in
        # the programme name (hard-coded values).
        if 'Dentistry' in programme_en or 'Medicine' in programme_en:
            ielts, ielts_s, ielts_r, ielts_l, ielts_w = 7.0, 6.5, 6.5, 6.5, 6.5
        elif 'Law' in programme_en or 'Politics' in programme_en:
            ielts, ielts_s, ielts_r, ielts_l, ielts_w = 6.5, 6.5, 6.0, 6.0, 6.0
        else:
            ielts, ielts_s, ielts_r, ielts_l, ielts_w = 6.5, 5.5, 5.5, 5.5, 5.5

        # 22. degree_type
        # NOTE(review): 1 presumably marks an undergraduate course -- confirm.
        degree_type = 1

        item['ib'] = ib
        item['alevel'] = alevel
        item['ucascode'] = ucascode
        item['degree_name'] = degree_name
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['overview_en'] = overview_en
        item['start_date'] = start_date
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['degree_type'] = degree_type
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['ielts'] = ielts
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        yield item

示例#21

显示文件

文件： LondonMetropolitanUniversity_p.py 项目： histudent/python_spider

    def parse(self, response):
        """Scrape one London Metropolitan University course page into an item.

        Fields are constants or XPath query results on ``response`` cleaned
        by the project helpers ``remove_tags``, ``remove_class``,
        ``clear_space_str`` and ``getTuition_fee`` (defined outside this
        method; presumably markup / attribute / whitespace cleaners and a
        fee parser -- confirm against their definitions).

        Args:
            response: Page response providing ``url`` and
                ``xpath(query).extract()``.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)``,
            exactly once, after all fields have been assigned.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def joined(query):
            # Concatenate every fragment matched by the XPath query.
            return ''.join(response.xpath(query).extract())

        # 1. university
        university = 'London Metropolitan University'

        # 2. url
        url = response.url

        # 3. programme_en -- starts as the full page title; the trailing
        # "- <degree>" part is split off in step 5.
        title = remove_tags(joined('/html/body/div[1]/div/h1/span')).strip()

        # 4. degree_type
        # NOTE(review): 2 presumably marks a postgraduate course -- confirm.
        degree_type = 2

        # 5. degree_name -- trailing "- <letters/spaces/slashes>" suffix of
        # the title, or '' when the title has no such suffix.
        suffix = re.search(r'-\s[A-Za-z\s/]+$', title)
        degree_name = suffix.group() if suffix else ''
        programme_en = title.replace(degree_name, '').strip()
        degree_name = degree_name.replace('-', '').strip()

        # 6. overview_en
        overview_en = remove_class(joined('//*[@id="why-study-this-course"]/p'))

        # 7. assessment_en
        assessment_en = clear_space_str(
            joined("//*[contains(text(),'Assessment')]//following-sibling::*"))

        # 8. modules_en
        modules_en = remove_class(
            joined('//*[@id="modular-structure"]')).replace('▼', '')

        # 9. career_en
        career_en = remove_class(joined('//*[@id="after-the-course"]'))

        # 10-14. IELTS overall and per-skill scores (hard-coded values).
        # Plain keywords are looked up in the programme name. Keys that
        # spell out the "- <degree>" suffix are looked up in the full title:
        # the suffix has already been stripped from programme_en, so testing
        # them against programme_en could not match once the suffix was
        # recognised.
        mid_keywords = (
            'Education',
            'Creative, Digital and Professional Writing',
            'Interpreting',
            'LLM',
            'Psychology',
        )
        high_keywords = (
            'Common Professional Exam',
            'Legal Practice Course',
        )
        high_titles = (
            'Biomedical Science - MSc',
            'Blood Science (Distance Learning) - MSc',
        )
        if (any(key in programme_en for key in mid_keywords)
                or 'Teaching Languages (English) - MA' in title):
            ielts = 6.5
            ielts_w = ielts_r = ielts_s = ielts_l = 6.0
        elif (any(key in title for key in high_titles)
                or any(key in programme_en for key in high_keywords)):
            ielts = 7.0
            ielts_w = ielts_r = ielts_s = ielts_l = 6.5
        else:
            ielts = 6.0
            ielts_w = ielts_r = ielts_s = ielts_l = 5.5

        # 15. tuition_fee
        tuition_fee = getTuition_fee(
            joined("//optgroup[@label='International']/option[1]/@data-cost"))

        # 16. duration / 17. duration_per
        duration_list = joined(
            "//optgroup[@label='International']/option[1]/@data-duration")
        if duration_list:
            # First number in the attribute (kept as a string); None rather
            # than an IndexError when the attribute carries no digits.
            digits = re.search(r'\d+', duration_list)
            duration = digits.group() if digits else None
            # NOTE(review): 3 / 1 / 4 look like the project's unit codes for
            # months / years / weeks -- confirm against the item schema.
            if 'months' in duration_list:
                duration_per = 3
            elif 'year' in duration_list:
                duration_per = 1
            elif 'weeks' in duration_list:
                duration_per = 4
            else:
                duration_per = None
        else:
            duration = None
            duration_per = None

        # 18. tuition_fee_pre
        tuition_fee_pre = '£'

        # 19. teach_time
        teach_time = 'Full time'

        # 20. location
        location = 'London'

        # 21. apply_pre
        apply_pre = '£'

        # 22. rntry_requirements (key spelling is the item's own)
        rntry_requirements = remove_class(joined('//*[@id="entry-requirements"]'))

        # 23. require_chinese_en
        require_chinese_en = "<p>A completed bachelor's degree from a high ranking Chinese institution Grade: 70% or above</p>"

        item['require_chinese_en'] = require_chinese_en
        item['rntry_requirements'] = rntry_requirements
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['assessment_en'] = assessment_en
        item['modules_en'] = modules_en
        item['career_en'] = career_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['ielts_s'] = ielts_s
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['teach_time'] = teach_time
        item['location'] = location
        yield item

示例#22

显示文件

    def parse(self, response):
        """Scrape one University of Leeds course page into one or more items.

        Fields are constants or XPath query results on ``response`` cleaned
        by the project helpers ``remove_tags``, ``remove_class``,
        ``clear_space_str`` and ``getTuition_fee`` (defined outside this
        method; presumably markup / attribute / whitespace cleaners and a
        fee parser -- confirm against their definitions).

        Args:
            response: Page response providing ``url`` and
                ``xpath(query).extract()``.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)``.
            When the page lists several "<degree>: <UCAS code>" entries the
            item is yielded once per entry with ``ucascode`` and
            ``degree_name`` overwritten; otherwise it is yielded once with
            the values taken from the page heading and UCAS field.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def joined(query):
            # Concatenate every fragment matched by the XPath query.
            return ''.join(response.xpath(query).extract())

        # 1. university
        university = 'University of Leeds'

        # 2. url
        url = response.url

        # 3. programme_en / 5. degree_name -- the last word of the heading is
        # the degree; the rest (commas removed) is the programme name.
        heading = remove_tags(joined('//*[@id="main"]/div/header/h1'))
        heading_words = heading.split()
        # An empty heading yields '' instead of raising IndexError.
        degree_name = heading_words[-1] if heading_words else ''
        if degree_name:
            # Cut out only the trailing occurrence, so a programme name that
            # happens to contain the degree token elsewhere stays intact.
            cut = heading.rfind(degree_name)
            heading = heading[:cut] + heading[cut + len(degree_name):]
        programme_en = heading.strip().replace(',', '')

        # 4. degree_type
        # NOTE(review): 1 presumably marks an undergraduate course -- confirm.
        degree_type = 1

        # 6. overview_en
        overview_en = clear_space_str(remove_class(joined('//*[@id="acc1"]/p')))

        # 7. modules_en
        modules_en = remove_class(
            joined("//*[contains(text(),'Modules')]//following-sibling::*[position()<7]"))

        # 8. assessment_en
        assessment_en = remove_class(clear_space_str(
            joined("//*[contains(text(),'Assessment')]//following-sibling::*")))

        # 9. start_date
        start_date = '2019-9'

        # 10. duration / duration_per
        duration_list = clear_space_str(remove_tags(
            joined("//*[contains(text(),'Duration/Mode')]//following-sibling::*")))
        # First number in the text (kept as a string), or 3 when the text
        # contains no digits.
        digits = re.search(r'\d+', duration_list)
        duration = digits.group() if digits else 3
        # NOTE(review): a value above 10 is treated as a month count (unit
        # code 3), anything else as years (unit code 1) -- confirm the codes.
        duration_per = 3 if int(duration) > 10 else 1

        # 11. ucascode
        ucascode = clear_space_str(remove_tags(
            joined("//*[contains(text(),'UCAS code:')]//following-sibling::*")))

        # 12-16. IELTS -- first "x.y" score in the entry-requirements
        # section is the overall score, the second (if any) is used for all
        # four skills; values stay strings as found on the page.
        entry_html = joined('//*[@id="acc3"]')
        scores = re.findall(r'[567]\.\d', entry_html)
        if len(scores) > 1:
            ielts = scores[0]
            skill_score = scores[1]
        elif len(scores) == 1:
            ielts = skill_score = scores[0]
        else:
            ielts = skill_score = None
        ielts_r = ielts_l = ielts_w = ielts_s = skill_score

        # 17. department
        department = clear_space_str(remove_tags(
            joined("//*[contains(text(),'This course is taught by')]/../following-sibling::*")))

        # 18. alevel
        alevel = remove_tags(
            joined('//*[@id="acc3"]/p[1]')).replace('A-level:', '').strip()

        # 19. tuition_fee
        tuition_fee = getTuition_fee(remove_tags(clear_space_str(entry_html)))

        # 20. tuition_fee_pre
        tuition_fee_pre = '£'

        # 21. ib -- last sibling after the "International Baccalaureate"
        # label, or 'N/A' when there is none.
        ib_fragments = response.xpath(
            "//*[contains(text(),'International Baccalaureate')]//following-sibling::*"
        ).extract()
        ib = remove_tags(ib_fragments[-1]) if ib_fragments else 'N/A'

        # 22. career_en
        career_en = clear_space_str(remove_class(
            joined("//h2[contains(text(),'Career opportunities')]/../following-sibling::*")))

        # 23. apply_proces_en
        apply_proces_en = 'https://application.leeds.ac.uk/login/?returnurl=%2f'

        # 24-28. TOEFL (hard-coded values)
        toefl = 87
        toefl_l = 20
        toefl_r = 20
        toefl_s = 22
        toefl_w = 21

        # 29. apply_pre
        apply_pre = '£'

        # 30. apply_documents_en
        apply_documents_en = '<p>Make sure you have all your supporting documents scanned and ready to upload with your online application. All documents should be in English, or sent with certified translations into English. Without copies of the required documents we will be unable to make you an offer.</p>'

        item['ib'] = ib
        item['alevel'] = alevel
        item['apply_pre'] = apply_pre
        item['apply_documents_en'] = apply_documents_en
        item['toefl'] = toefl
        item['toefl_l'] = toefl_l
        item['toefl_r'] = toefl_r
        item['toefl_s'] = toefl_s
        item['toefl_w'] = toefl_w
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['start_date'] = start_date
        item['duration'] = duration
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['ielts_s'] = ielts_s
        item['department'] = department
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['career_en'] = career_en
        item['apply_proces_en'] = apply_proces_en
        item['duration_per'] = duration_per

        # Pages offering several degrees list "<degree>: <UCAS code>" text
        # nodes; emit one item per such entry.
        variant_texts = response.xpath(
            "//*[contains(text(),'UCAS code:')]//following-sibling::span//text()"
        ).extract()
        emitted = False
        if len(variant_texts) > 1:
            for raw_text in variant_texts:
                entry = raw_text.strip()
                if ':' not in entry:
                    # Whitespace-only or label-free text node: nothing to
                    # split, and the regexes below would find no match.
                    continue
                item['ucascode'] = re.search(r':(.*)', entry).group(1).strip()
                item['degree_name'] = re.search(r'(.*):', entry).group(1).strip()
                # NOTE(review): the same item object is mutated and yielded
                # repeatedly; if downstream processing keeps a reference,
                # later entries overwrite earlier ones -- consider yielding
                # a copy once the item type is confirmed to support it.
                yield item
                emitted = True

        if not emitted:
            # Single-degree page (or no usable entries): fall back to the
            # values scraped from the heading and the UCAS field.
            item['ucascode'] = ucascode
            item['degree_name'] = degree_name
            yield item

示例#23

显示文件

文件： TheUniversityofWesternAustralia_p.py 项目： histudent/python_spider

    def parse(self, response):
        pass
        item = get_item1(ScrapyschoolEnglandItem1)

        #1.university
        university = 'The University of Western Australia'
        # print(university)

        #2.url
        url = response.url
        # print(url)

        #3.programme_en
        programme_en = response.xpath(
            '//*[@id="page-content"]/div[1]/div[3]/div/div/div[3]/h1').extract(
            )
        programme_en = ''.join(programme_en)
        programme_en = remove_tags(programme_en)
        if 'Master of' in programme_en:
            programme_en = programme_en.replace('Master of', '').strip()
        # print(programme_en)

        #4.overview_en
        overview_en = response.xpath(
            '//*[@id="course-details"]/div/div/div/section/div[1]/div[1]/div[1]/div/div'
        ).extract()
        overview_en = ''.join(overview_en)
        overview_en = remove_class(overview_en)
        # print(overview_en)

        #5.modules_en
        modules_en = response.xpath(
            "//h2[contains(text(),'Course structure details')]//following-sibling::*"
        ).extract()
        modules_en = ''.join(modules_en)
        modules_en = remove_class(modules_en)
        # print(modules_en)

        #6.teach_time
        teach_time = 'coursework'

        #7.location
        location = response.xpath(
            "//*[contains(text(),'Locations')]//following-sibling::*").extract(
            )[0]
        # location = ''.join(location)
        location = remove_tags(location).strip()
        location = clear_space_str(location)
        # print(location)

        #8.start_date
        start_date = response.xpath(
            "//*[contains(text(),'Starting dates')]//following-sibling::*"
        ).extract()
        start_date = ''.join(start_date)
        start_date = remove_tags(start_date).strip()
        if 'January' in start_date:
            start_date = '2019-1'
        else:
            start_date = 'Semester1,Semester2'
        # print(start_date)

        #9.career_en
        career_en = response.xpath(
            '//*[@id="careers-and-further-study"]/div/div/div/section/div[2]/div/div/div/a'
        ).extract()
        career_en = ''.join(career_en)
        career_en = remove_class(career_en).strip()
        # print(career_en)

        #10.tuition_fee
        tuition_fee = response.xpath(
            "//*[contains(text(),'fee')]//following-sibling::div").extract()
        tuition_fee = ''.join(tuition_fee)
        tuition_fee = remove_tags(tuition_fee)
        tuition_fee = getTuition_fee(tuition_fee)
        if tuition_fee == 0:
            tuition_fee = response.xpath(
                "//*[contains(text(),'Fee')]//following-sibling::div").extract(
                )
            tuition_fee = ''.join(tuition_fee)
            tuition_fee = getTuition_fee(tuition_fee)
        # print(tuition_fee,response.url)

        #11.tuition_fee_pre
        tuition_fee_pre = '$'

        #12.rntry_requirements_en
        rntry_requirements_en = response.xpath(
            "//*[contains(text(),'Admission Requirements')]//following-sibling::div"
        ).extract()
        rntry_requirements_en = ''.join(rntry_requirements_en)
        rntry_requirements_en = remove_class(rntry_requirements_en)
        # print(rntry_requirements_en)

        #13.ielts 14151617
        if 'MBA' in programme_en:
            ielts = 7.0
            ielts_r = 6.5
            ielts_w = 6.5
            ielts_l = 6.5
            ielts_s = 6.5
        elif 'Health' in programme_en:
            ielts = 7.0
            ielts_r = 6.5
            ielts_w = 6.5
            ielts_l = 6.5
            ielts_s = 6.5
        elif 'Educational Leadership' in programme_en:
            ielts = 7.0
            ielts_r = 6.5
            ielts_w = 6.5
            ielts_l = 6.5
            ielts_s = 6.5
        elif 'Forensic Odontology' in programme_en:
            ielts = 7.0
            ielts_r = 6.5
            ielts_w = 6.5
            ielts_l = 6.5
            ielts_s = 6.5
        elif 'Dental Medicine' in programme_en:
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif 'Clinical Dentistry' in programme_en:
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif programme_en == 'Medicine':
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif 'Podiatric Medicine' in programme_en:
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif 'Clinical Neuropsychology' in programme_en:
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif 'Clinical Psychology' in programme_en:
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif 'Clinical Audiology' in programme_en:
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif 'Clinical Audiology' in programme_en:
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif programme_en == 'Industrial and Organisational Psychology':
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif programme_en == 'Pharmacy':
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif programme_en == 'Social Work':
            ielts = 7.0
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        elif programme_en == 'Education':
            ielts = 7.5
            ielts_r = 6.5
            ielts_w = 6.5
            ielts_l = 6.5
            ielts_s = 6.5
        elif programme_en == 'Teaching':
            ielts = 7.5
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 8.0
            ielts_s = 8.0
        elif 'Law' in programme_en:
            ielts = 7.0
            ielts_r = 6.5
            ielts_w = 6.5
            ielts_l = 6.5
            ielts_s = 6.5
        elif 'Juris Doctor' in programme_en:
            ielts = 7.5
            ielts_r = 7.0
            ielts_w = 7.0
            ielts_l = 7.0
            ielts_s = 7.0
        else:
            ielts = 6.5
            ielts_r = 6.0
            ielts_w = 6.0
            ielts_l = 6.0
            ielts_s = 6.0

        #18.toefl 19202122
        if 'Law' in programme_en:
            toefl = 100
            toefl_s = 28
            toefl_l = 26
            toefl_r = 26
            toefl_w = 26
        elif 'Juris Doctor' in programme_en:
            toefl = 106
            toefl_s = 28
            toefl_l = 26
            toefl_r = 26
            toefl_w = 26
        elif 'MBA' in programme_en:
            toefl = 100
            toefl_s = 20
            toefl_l = 20
            toefl_r = 20
            toefl_w = 20
        elif 'MBA' in programme_en:
            toefl = 100
            toefl_s = 20
            toefl_l = 20
            toefl_r = 20
            toefl_w = 20
        elif 'Clinical Neuropsychology' in programme_en:
            toefl = 94
            toefl_s = 23
            toefl_l = 24
            toefl_r = 24
            toefl_w = 27
        elif 'Clinical Psychology' in programme_en:
            toefl = 94
            toefl_s = 23
            toefl_l = 24
            toefl_r = 24
            toefl_w = 27
        elif 'Clinical Audiology' in programme_en:
            toefl = 94
            toefl_s = 23
            toefl_l = 24
            toefl_r = 24
            toefl_w = 27
        elif 'Industrial and Organisationa' in programme_en:
            toefl = 94
            toefl_s = 23
            toefl_l = 24
            toefl_r = 24
            toefl_w = 27
        else:
            toefl = 82
            toefl_s = 20
            toefl_l = 20
            toefl_r = 18
            toefl_w = 22

        #23.apply_proces_en
        apply_proces_en = 'Check your chosen course is open to applications. Ensure you meet the admission requirements for this course as detailed on the previous tab. Ensure you meet our English language competency requirement and any course/major prerequisites. Apply'

        #24.apply_pre
        apply_pre = '$'

        #25.apply_fee
        apply_fee = 100

        #26.degree_name
        degree_name = response.xpath(
            '//*[@id="page-content"]/div[1]/div[3]/div/div/div[3]/h1').extract(
            )
        degree_name = ''.join(degree_name)
        degree_name = remove_tags(degree_name)
        # print(degree_name)

        #27.degree_type
        degree_type = 2

        #28.duration
        duration = response.xpath(
            "//*[contains(text(),'duration')]//following-sibling::*[1]//ul//li"
        ).extract()
        duration = ''.join(duration)
        if '<li>1.5' in duration:
            duration = 1.5
        elif '<li>1 to 2' in duration:
            duration = '1/2'
        elif '<li>0.5-1.5' in duration:
            duration = '0.5/1.5'
        elif '<li>2-3' in duration:
            duration = '2/3'
        elif '<li>One' in duration:
            duration = 1
        elif '<li>Two' in duration:
            duration = 2
        else:
            duration = re.findall(r'\d+', duration)[0]
        # print(duration,url)

        item['duration'] = duration
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['teach_time'] = teach_time
        item['location'] = location
        item['start_date'] = start_date
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['rntry_requirements_en'] = rntry_requirements_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_s'] = toefl_s
        item['toefl_l'] = toefl_l
        item['toefl_w'] = toefl_w
        item['apply_proces_en'] = apply_proces_en
        item['apply_pre'] = apply_pre
        item['apply_fee'] = apply_fee
        yield item

示例#24

显示文件

文件： TheUniversityofWesternAustralia_u.py 项目： histudent/python_spider

    def parse(self, response):
        """Build the item for one University of Western Australia course page.

        Extracts the course fields from ``response`` with XPath queries, adds
        the fixed language-score and application values, and yields the item
        once per entry of the "Degrees course is available in" list (or once
        with an empty ``degree_name`` when that list is missing).
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        #1.university
        university = 'The University of Western Australia'

        #2.url
        url = response.url

        #3.programme_en
        programme_en = response.xpath(
            '//*[@id="page-content"]/div[1]/div[3]/div/div/div[3]/h1').extract()
        programme_en = remove_tags(''.join(programme_en))

        #4.overview_en
        overview_en = response.xpath(
            '//*[@id="course-details"]/div/div/div/section/div[1]/div[1]/div[1]/div/div'
        ).extract()
        overview_en = remove_class(''.join(overview_en))

        #5.modules_en
        modules_en = response.xpath(
            "//h2[contains(text(),'Course structure details')]//following-sibling::*"
        ).extract()
        modules_en = clear_space_str(remove_class(''.join(modules_en)))

        #6.apply_pre
        apply_pre = '$'

        #7.location
        location_nodes = response.xpath(
            "//*[contains(text(),'Locations')]//following-sibling::*").extract()
        # Only the first sibling is used. Fall back to '' instead of raising
        # IndexError (and losing the whole page) when nothing matches.
        location = remove_tags(location_nodes[0]).strip() if location_nodes else ''
        if 'Perth' in location and 'Albany' in location:
            location = 'Albany,Perth'

        #8.start_date
        start_date = response.xpath(
            "//*[contains(text(),'Starting dates')]//following-sibling::*"
        ).extract()
        start_date = remove_tags(''.join(start_date)).strip()
        if 'January' in start_date:
            start_date = '2019-1'
        else:
            start_date = 'Semester1,Semester2'

        #9.career_en
        career_en = response.xpath(
            '//*[@id="careers-and-further-study"]/div/div/div/section/div[2]/div/div/div/a'
        ).extract()
        career_en = remove_class(''.join(career_en)).strip()

        #10.tuition_fee
        tuition_fee = response.xpath(
            "//*[contains(text(),'Fee')]//following-sibling::div").extract()
        tuition_fee = getTuition_fee(remove_tags(''.join(tuition_fee)))

        #11.tuition_fee_pre
        tuition_fee_pre = '$'

        #12.rntry_requirements_en
        rntry_requirements_en = response.xpath(
            "//h3[contains(text(),'Admission requirements')]//following-sibling::div[1]"
        ).extract()
        rntry_requirements_en = remove_class(''.join(rntry_requirements_en))

        #13-22.ielts / toefl: fixed scores, with a higher set for Law programmes
        if 'Law' in programme_en:
            ielts = 7.0
            ielts_r = ielts_w = ielts_l = ielts_s = 6.5
            toefl = 100
            toefl_s = 28
            toefl_l = toefl_r = toefl_w = 26
        else:
            ielts = 6.5
            ielts_r = ielts_w = ielts_l = ielts_s = 6.0
            toefl = 82
            toefl_s = 20
            toefl_l = 20
            toefl_r = 18
            toefl_w = 22

        #23.apply_proces_en
        apply_proces_en = 'Check your chosen course is open to applications. Ensure you meet the admission requirements for this course as detailed on the previous tab. Ensure you meet our English language competency requirement and any course/major prerequisites. Apply'

        #24.apply_fee
        apply_fee = 100

        #25.china_score_requirements
        gaokao_nodes = response.xpath(
            "//*[contains(text(),'Chinese Gao Kao')]//following-sibling::div/ul/li"
        ).extract()
        # Empty string when the page lists no Gao Kao requirement.
        china_score_requirements = remove_tags(gaokao_nodes[0]) if gaokao_nodes else ''

        #26.degree_type
        degree_type = 1

        #28.department
        department = response.xpath(
            "//*[contains(text(),'Faculty')]//following-sibling::div[1]/ul/li[1]"
        ).extract()
        department = remove_tags(''.join(department)).replace('&amp;', '')

        #29.duration_per
        duration_per = 1

        #30.duration
        duration_markup = ''.join(response.xpath(
            "//*[contains(text(),'duration')]//following-sibling::div[1]/ul/li[1]"
        ).extract())
        digits = re.findall(r'\d', duration_markup)
        # First digit in the markup (kept as a string); 3 when none is found.
        duration = digits[0] if digits else 3

        item['location'] = location
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['department'] = department
        item['degree_type'] = degree_type
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['start_date'] = start_date
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['rntry_requirements_en'] = rntry_requirements_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_s'] = toefl_s
        item['toefl_l'] = toefl_l
        item['toefl_w'] = toefl_w
        item['apply_proces_en'] = apply_proces_en
        item['apply_pre'] = apply_pre
        item['apply_fee'] = apply_fee
        item['china_score_requirements'] = china_score_requirements

        #27.degree_name
        degree_names = response.xpath(
            "//*[contains(text(),'Degrees course is available in')]//following-sibling::div[1]//ul//li"
        ).extract()
        if degree_names:
            # NOTE(review): the same item object is re-yielded with only
            # degree_name changed; confirm downstream consumers do not hold
            # on to earlier yields.
            for entry in degree_names:
                item['degree_name'] = entry.replace('<li>', '').replace('</li>', '')
                yield item
        else:
            item['degree_name'] = ''
            yield item

示例#25

显示文件

文件： SheffieldHallamUniversity_u.py 项目： histudent/python_spider

    def parse(self, response):
        """Build and yield the item for one Sheffield Hallam University course page.

        Each scraped field is the concatenated markup of the nodes matched by
        one XPath query, passed through the project's tag/class/whitespace
        helpers. IELTS scores are parsed from the entry-requirements text and
        left as ``None`` unless exactly two scores are found.
        """
        def joined(query):
            # Concatenated markup of every node matched by ``query``.
            return ''.join(response.xpath(query).extract())

        item = get_item1(ScrapyschoolEnglandItem1)

        # 1-2. university / url
        university = 'Sheffield Hallam University'
        url = response.url

        # 3. programme_en
        programme_en = remove_tags(joined("/html/body/section[1]//h1"))

        # 4-5. degree_type / degree_name
        degree_type = 1
        degree_name = remove_tags(joined('/html/body/section[1]/div/div[2]/span'))

        # 6-7. tuition_fee / tuition_fee_pre
        tuition_fee = getTuition_fee(
            joined("//*[contains(text(),'What is the fee?')]//following-sibling::*"))
        tuition_fee_pre = '£'

        # 8. duration (free text) and its unit
        duration = remove_tags(
            joined("//*[contains(text(),'How long will I study?')]//following-sibling::*")
        ).strip()
        duration_per = 1

        # 9. location
        location = 'Sheffield'

        # 10. ucascode
        ucascode = clear_space_str(remove_tags(
            joined("//*[contains(text(),'What is the UCAS code?')]//following-sibling::*")))

        # 11. overview_en
        overview_en = clear_space_str(remove_class(
            joined("//*[contains(text(),'Course summary')]//following-sibling::*")))

        # 12. career_en
        career_en = remove_class(
            joined("//*[contains(text(),'Future careers')]//following-sibling::*"))

        # 13. alevel
        alevel = remove_tags(
            joined('//*[@id="entry-requirements"]/div/div[1]/ul[2]/li[1]'))

        # 14. apply_proces_en (href values of the apply links)
        apply_proces_en = joined('//*[@id="apply-now"]/div[1]//a/@href')

        # 17. ielts_desc and the scores parsed out of it
        ielts_desc = remove_tags(joined('//*[@id="entry-requirements"]/div/div[1]'))
        scores = re.findall(r'[567]\.\d', ielts_desc)
        if len(scores) == 2:
            # First score is the overall; the second is used for every band.
            ielts, band = scores
        else:
            ielts = band = None
        ielts_r = ielts_l = ielts_s = ielts_w = band

        # 18. require_chinese_en
        require_chinese_en = '<p>The following qualifications from China will be considered for entry on to undergraduate programmes, with a minimum average of 60 per cent: Diploma from Specialised College (Zhongzhnan) Diploma from Vocational Secondary School (Zhixiao) Three year middle school diploma plus foundation degree A levels Graduate Diploma from: Radio and TV Universities Spare Time Universities Training Colleges for Administrative cadres Higher Education Self Study Examinations Adult Education/Adult Education in Science and Technology subjects Senior High School Diploma Chinese University Entrance Examination (until 2003) College Graduation Diploma (Dazhuan awarded by university/college on completion of 2-3 years study) Applicants who have completed the first year of an undergraduate degree at a Chinese university may be considered for direct entry to Sheffield Hallam University undergraduate programmes.Sheffield Hallam welcomes applications from international school students taking the International Baccalaureate Diploma and those achieving 28 points or more will usually be successful in obtaining an offer of a place on our undergraduate programmes. For information about IB points equivalences against the UCAS tariff please visit the UCAS website.</p>'

        # 19. apply_pre
        apply_pre = '£'

        # 20. start_date
        start_text = clear_space_str(remove_tags(
            joined("//*[contains(text(),'When do I start?')]//following-sibling::*")))
        if 'September, January' in start_text:
            start_date = '2018-9,2019-1'
        elif 'January' in start_text:
            start_date = '2019-1'
        else:
            start_date = '2018-' + str(translate_month(start_text))

        # 21. modules_en
        modules_en = remove_class(joined('//div[@data-section="split"][6]'))

        # Fill the item in the same key order as before.
        for key, value in (
                ('modules_en', modules_en),
                ('ielts', ielts),
                ('ielts_r', ielts_r),
                ('ielts_w', ielts_w),
                ('ielts_s', ielts_s),
                ('ielts_l', ielts_l),
                ('start_date', start_date),
                ('apply_pre', apply_pre),
                ('require_chinese_en', require_chinese_en),
                ('ielts_desc', ielts_desc),
                ('university', university),
                ('url', url),
                ('programme_en', programme_en),
                ('degree_type', degree_type),
                ('degree_name', degree_name),
                ('tuition_fee', tuition_fee),
                ('tuition_fee_pre', tuition_fee_pre),
                ('duration', duration),
                ('location', location),
                ('ucascode', ucascode),
                ('overview_en', overview_en),
                ('career_en', career_en),
                ('alevel', alevel),
                ('apply_proces_en', apply_proces_en),
                ('duration_per', duration_per),
        ):
            item[key] = value
        yield item

示例#26

显示文件

文件： GoldSmithUniversityofLondon_p.py 项目： histudent/python_spider

    def parse(self, response):
        """Build and yield the item for one Goldsmiths course page.

        Most fields are extracted best-effort: when an extraction raises, the
        field falls back to ``'N/A'`` (``None`` for duration, fixed default
        scores for IELTS) so that the page still produces an item.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        # 1.university
        university = 'Goldsmiths, University of London'

        # 2.department
        try:
            department = response.xpath('//*[@id="maincontent"]/article/section[1]/div/div/div/div[1]/p/a').extract()
            department = ''.join(department)
            department = clear_space_str(department)
            department = remove_tags(department)
        except Exception:
            department = 'N/A'

        # 3.programme_en
        try:
            programme_en_a = response.xpath('//*[@id="maincontent"]/article/header/div/div/div/div[1]/div[1]/div/h1/span').extract()
            programme_en_a = remove_tags(''.join(programme_en_a))
        except Exception:
            # Keep the raw heading defined: it is read again for degree_name
            # below, and used to be unbound on this path (NameError).
            programme_en_a = ''
            programme_en = 'N/A'
        else:
            # Drop the first two words of the heading, then tidy the rest.
            # NOTE(review): replacing 'in ' also hits words that merely end
            # in "in" - confirm that is intended.
            programme_en = ' '.join(programme_en_a.split()[2:])
            programme_en = programme_en.replace(';', ' ').replace('in ', '').strip()

        # 4.overview_en
        try:
            overview_en = response.xpath('//*[@id="maincontent"]/article/section[2]/div/div/div').extract()
            overview_en = ''.join(overview_en)
            overview_en = clear_space_str(overview_en)
            overview_en = remove_class(overview_en)
        except Exception:
            # Was assigned to a misspelled name, leaving overview_en unbound.
            overview_en = 'N/A'

        # 5.duration
        try:
            duration = response.xpath('//*[@id="maincontent"]/article/section[1]/div/div/div/div[2]/p').extract()
            duration = ''.join(duration)
            # First digit in the markup; IndexError (no digit) -> None.
            duration = re.findall(r'\d', duration)[0]
            duration = remove_tags(duration)
        except Exception:
            duration = None

        #6.duration_per
        duration_per = 1

        # 7.modules_en
        try:
            modules_en = response.xpath('//*[@id="maincontent"]/article/section[3]/div/div').extract()
            modules_en = ''.join(modules_en)
            modules_en = remove_class(modules_en)
            modules_en = clear_space_str(modules_en)
        except Exception:
            modules_en = 'N/A'

        # 8.career_en
        try:
            career_en = response.xpath('//*[@id="maincontent"]/article/section[7]/div/div').extract()
            career_en = ''.join(career_en)
            career_en = remove_class(career_en)
            career_en = clear_space_str(career_en)
        except Exception:
            career_en = 'N/A'

        # 9.other
        other = 'https://www.gold.ac.uk/media/study-section/fees/PG-Fees-1819.pdf'

        #10.apply_proces_en
        apply_proces_en = response.xpath('//*[@id="maincontent"]/article/section[6]/div/div').extract()
        apply_proces_en = ''.join(apply_proces_en)
        apply_proces_en = clear_space_str(apply_proces_en)
        apply_proces_en = remove_class(apply_proces_en)

        # 11.-15. IELTS (overall, writing, reading, speaking, listening)
        try:
            ielts_text = response.xpath(
                '//h3[contains(text(),"International qualifications")]/following-sibling::p').extract()
            ielts_text = remove_tags(''.join(ielts_text))
            pat = re.findall(r'\d\.\d', ielts_text)

            # clear_space_str is applied only to matched strings. Feeding it
            # None or the float defaults presumably raises, which would land
            # in the except below and discard the parsed scores
            # (TODO confirm against the helper).
            if len(pat) == 3:
                ielts = clear_space_str(pat[0])
                ielts_w = clear_space_str(pat[1])
                ielts_r = ielts_s = ielts_l = clear_space_str(pat[2])
            elif len(pat) == 2:
                ielts = clear_space_str(pat[0])
                ielts_w = clear_space_str(pat[1])
                ielts_r = ielts_s = ielts_l = None
            else:
                ielts = 6.5
                ielts_w = ielts_r = ielts_s = ielts_l = 6.0
        except Exception:
            ielts = 6.5
            ielts_w = ielts_r = ielts_s = ielts_l = 6.0

        # 14.rntry_requirements
        rntry_requirements = response.xpath('//*[@id="maincontent"]/article/section[4]/div/div').extract()
        rntry_requirements = ''.join(rntry_requirements)
        rntry_requirements = remove_class(rntry_requirements)

        # 15.apply_documents_en
        try:
            apply_documents_en = response.xpath(
                '//h3[contains(text(),"When to apply")]/preceding-sibling::ul').extract()
            apply_documents_en = ''.join(apply_documents_en)
            apply_documents_en = clear_space_str(apply_documents_en)
            apply_documents_en = remove_class(apply_documents_en)
        except Exception:
            apply_documents_en = 'N/A'

        # 16.url
        url = response.url

        #17.degree_type
        degree_type = 2

        #18.degree_name: first label (in this order) found in the raw heading
        degree_labels = ('MA', 'MSc', 'PGCert', 'MMus', 'MRes', 'MPhil', 'MFA', 'PhD')
        degree_name = next(
            (label for label in degree_labels if label in programme_en_a),
            'Graduate')

        #19.location
        location = 'London'

        #20.apply_pre
        apply_pre = '£'

        #21.require_chinese_en
        # NOTE(review): the literal ends with a stray '"' after </p>; left
        # untouched here because it is emitted data - confirm and clean up.
        require_chinese_en = '<p>Postgraduate taught For entry to postgraduate programmes you will normally need a Bachelors degree in relevant subject. Refer to individual course pages to see whether there are any additional application requirements.Research degrees You will normally need to have completed a Masters degree in a subject relevant to your proposed postgraduate study. There may also be other specific entrance requirements. You can refer to individual course pages to find out what these are.</p>"'

        #22.assessment_en
        assessment_en = response.xpath("//*[contains(text(),'Assessment')]//following-sibling::*").extract()
        assessment_en = ''.join(assessment_en)
        assessment_en = remove_class(assessment_en)

        item['assessment_en'] = assessment_en
        item['require_chinese_en'] = require_chinese_en
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['department'] = department
        item['programme_en'] = programme_en
        item['overview_en'] = overview_en
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['modules_en'] = modules_en
        item['career_en'] = career_en
        item['other'] = other
        item['apply_proces_en'] = apply_proces_en
        item['ielts'] = ielts
        item['ielts_w'] = ielts_w
        item['ielts_r'] = ielts_r
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['rntry_requirements'] = rntry_requirements
        item['apply_documents_en'] = apply_documents_en
        item['url'] = url
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['location'] = location
        yield item

示例#27

显示文件

文件： TeessideUniversity_u.py 项目： histudent/python_spider

    def parse(self, response):
        """Build and yield the item for one Teesside University course page.

        Text fields come from XPath queries on ``response``; tuition fee,
        start date and the IELTS scores are fixed values, with the IELTS
        scores chosen by keywords in the degree name and programme title.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        #1.university
        university = 'Teesside University'

        #2.url
        url = response.url

        #3.programme_en
        programme_en = response.xpath(
            '//*[@id="coursepage"]/section[1]/div[1]/div/div[1]/h1/text()'
        ).extract()
        programme_en = remove_tags(''.join(programme_en)).strip()

        #4.degree_type
        degree_type = 1

        #5.degree_name
        degree_name = response.xpath(
            '//*[@id="coursepage"]/section[1]/div[1]/div/div[1]/h1/span'
        ).extract()
        degree_name = remove_tags(''.join(degree_name)).strip()
        if '(Hons)' in degree_name:
            degree_name = degree_name.replace('(Hons)', '').strip()

        #6.overview_en
        overview_en = response.xpath(
            '//*[@id="tab1"]/div/div[1]/div').extract()
        overview_en = remove_class(''.join(overview_en))

        #7.modules_en (the leftover debug print of this value was removed)
        modules_en = response.xpath(
            '//*[@id="tab2"]/div[1]/div/div[1]/div[1]').extract()
        modules_en = remove_class(''.join(modules_en))

        #8.assessment_en
        assessment_en = response.xpath(
            '//*[@id="tab2"]/div[1]/div/div[1]/div[3]/p').extract()
        assessment_en = remove_class(''.join(assessment_en))

        #9.career_en
        career_en = response.xpath(
            "//*[contains(text(),'Career opportunities')]//following-sibling::*"
        ).extract()
        career_en = remove_class(''.join(career_en))

        #10.ucascode: keep the first four characters of the cleaned text
        ucascode = response.xpath(
            '//*[@id="coursepage"]/section[1]/div[1]/div/div[2]/div/div[2]/p/text()'
        ).extract()
        ucascode = clear_space_str(''.join(ucascode))
        try:
            ucascode = ucascode[:4]
        except TypeError:
            # Only reachable if clear_space_str returns a non-sliceable value.
            ucascode = 'N/A'

        #11.department
        department = response.xpath(
            '//*[@id="coursepage"]/section[1]/div[1]/div/div[2]/div/div[3]/a/p'
        ).extract()
        department = remove_tags(''.join(department))
        department = department.replace('&amp; ', '')

        #12.duration
        duration = response.xpath(
            '//*[@id="courseinfopdf"]/div[1]/ul/li[1]').extract()
        duration = remove_tags(''.join(duration))

        #13.tuition_fee
        tuition_fee = 11825

        #14.apply_desc_en
        apply_desc_en = response.xpath(
            '//*[@id="tab3"]/div/div[1]/div').extract()
        apply_desc_en = remove_class(''.join(apply_desc_en))

        #15.start_date
        start_date = '2018-10-13'

        #16.tuition_fee_pre
        tuition_fee_pre = '£'

        #17.other
        other = 'https://www.tees.ac.uk/sections/international/fees.cfm'

        #18.require_chinese_en
        require_chinese_en = '<p>For entry onto a Foundation or Extended programme, applicants require:  Huikao (Chinese senior secondary school graduation certificate) Successful completion of the first two years of Senior Secondary School with a minimum average of 70% or successful completion of Senior Secondary School with a minimum average of 60% For entry onto an Undergraduate programme, applicants require:  For entry onto Year 1:Huikao (Chinese senior secondary school graduation certificate) Successful completion of Senior Secondary School with a minimum average of 80% Or Gaokao (Chinese university or college entrance exam) with a minimum score of 500 For entry onto Higher National Diploma: Gaokao with a minimum score of 450 For entry onto Integrated Master of Engineering – MEng (Hons): Gaokao with a minimum score of 550 For entry onto Undergraduate top-up programmes (third-year entry) Dazhuan (three-year college graduation diploma) with a minimum of 70% average or, SQA Higher National Diploma with BBC as minimum or, Edexcel Higher National Diploma – standard UK entry requirements or, UK accredited foundation degree</p>'

        #19-23.ielts
        # The four band scores are chosen by keyword in degree_name.
        if any(subject in degree_name for subject in (
                'Dental Hygiene and Dental Therapy',
                'Diagnostic Radiography',
                'Midwifery',
                'Physiotherapy',
                'Occupational Therapy')):
            band = 6.5
        elif 'Nursing Studies' in degree_name:
            band = 7.0
        else:
            band = 5.5
        ielts_r = ielts_w = ielts_l = ielts_s = band

        # The overall score is chosen by keyword in programme_en; groups are
        # checked in this order and the first match wins. The overall score
        # previously set alongside the bands was always overwritten by this
        # lookup, so those dead assignments were dropped.
        # NOTE(review): confirm the degree_name-based overall was not meant
        # to take precedence.
        overall_by_subject = (
            (('Fine Art', 'Design', 'Media Production', 'Engineering',
              'Science', 'Computing', 'Media Studies', 'Journalism'), 5.5),
            (('Business', 'English', 'Sport', 'History', 'Psychology',
              ' Criminology', 'Sociology', 'Youth Studies', 'Education',
              'Law', 'Crime', 'Investigation'), 6.0),
            (('Health',), 7.0),
        )
        ielts = 6.0
        for subjects, score in overall_by_subject:
            if any(subject in programme_en for subject in subjects):
                ielts = score
                break

        #24.apply_pre
        apply_pre = '£'

        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['career_en'] = career_en
        item['ucascode'] = ucascode
        item['department'] = department
        item['duration'] = duration
        item['tuition_fee'] = tuition_fee
        item['apply_desc_en'] = apply_desc_en
        item['start_date'] = start_date
        item['other'] = other
        item['tuition_fee_pre'] = tuition_fee_pre
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['require_chinese_en'] = require_chinese_en
        item['apply_pre'] = apply_pre
        # The populated item was never emitted; yield it so the callback
        # produces output like the other parse callbacks.
        yield item

示例#28

显示文件

    def parse(self, response):
        """Scrape a University of Sunderland course page and yield one item.

        Every field is read from ``response`` with XPath or set to a fixed
        value, stored on an item created by
        ``get_item1(ScrapyschoolEnglandItem1)``, and the item is yielded once.

        Args:
            response: the downloaded course page; only ``response.url`` and
                ``response.xpath(...).extract()`` are used.

        Yields:
            The populated item.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        # 1.university
        university = 'University of Sunderland'

        # 2.url
        url = response.url

        # 3.degree_type
        degree_type = 1

        # 4.degree_name
        degree_name = ''.join(response.xpath(
            '/html/body/div[2]/header/div/div[1]/h1/span[1]').extract())
        degree_name = remove_tags(degree_name)
        if '(Hons)' in degree_name:
            degree_name = degree_name.replace('(Hons)', '').strip()

        # 5.programme_en
        programme_en = ''.join(response.xpath(
            '/html/body/div[2]/header/div/div[1]/h1/text()').extract())
        programme_en = clear_space_str(programme_en).strip()

        # 6.duration
        duration = ''.join(response.xpath(
            '/html/body/div[2]/header/aside/div/ul/li[1]/span').extract())
        duration = remove_tags(duration)
        if 'Full/Part' in duration:
            # The span holds the study mode rather than a length here.
            duration = ''

        # 8.start_date
        start_text = ''.join(response.xpath(
            "//*[contains(text(),'Next start date')]//*").extract())
        start_text = remove_tags(start_text)
        # The first number in the text is used as the day component; an
        # absent number leaves the day empty (same as before).
        numbers = re.findall(r'\d+', start_text)
        day = numbers[0] if numbers else ''
        # Checked in this order; the first month abbreviation found wins.
        # Fix: 'Jan' and 'Nov' previously produced each other's prefix.
        month_prefixes = (
            ('Oct', '2018-10-'),
            ('Aug', '2018-8-'),
            ('Sep', '2018-9-'),
            ('Jan', '2019-1-'),
            ('Nov', '2018-11-'),
        )
        start_date = ''
        for month, prefix in month_prefixes:
            if month in start_text:
                start_date = prefix + day
                break

        # 9.tuition_fee
        tuition_fee = ''.join(response.xpath(
            "//*[contains(text(),' International fee')]//*|//*[contains(text(),'Tuition fee')]//*"
        ).extract())
        tuition_fee = remove_tags(tuition_fee)
        if ',' in tuition_fee:
            tuition_fee = getTuition_fee(tuition_fee)
        else:
            # No thousands separator: take the first run of digits, if any.
            fee_numbers = re.findall(r'\d+', tuition_fee)
            tuition_fee = fee_numbers[0] if fee_numbers else None

        # 10.ucascode
        ucascode = ''.join(response.xpath(
            "//*[contains(text(),'UCAS code')]/..").extract())
        ucascode = remove_tags(ucascode).replace('UCAS code', '').strip()
        ucascode = ucascode[:50]  # keep at most 50 characters

        # 11.tuition_fee_pre
        tuition_fee_pre = '£'

        # 12.overview_en
        overview_en = ''.join(response.xpath(
            "//*[contains(text(),'Overview')]//following-sibling::p").extract())
        overview_en = remove_class(overview_en)

        # 13.modules_en
        modules_en = ''.join(response.xpath('//*[@id="course-years"]').extract())
        modules_en = remove_class(modules_en)

        # 14.apply_desc_en
        apply_desc_en = ''.join(response.xpath(
            "//*[contains(text(),'Entry requirements')]//following-sibling::*"
        ).extract())
        apply_desc_en = remove_class(apply_desc_en)

        # 15-19.ielts (fixed values for every course)
        ielts = 6.0
        ielts_r = 5.5
        ielts_w = 5.5
        ielts_s = 5.5
        ielts_l = 5.5

        # 20.career_en
        career_en = ''.join(response.xpath(
            "//*[contains(text(),'Employment')]//following-sibling::*"
        ).extract())
        career_en = remove_class(career_en)

        # 21.apply_pre
        apply_pre = '£'

        # 22.apply_proces_en
        # Fixed text. The literal was broken across two physical lines, which
        # is not valid inside a single-quoted string; it is now two adjacent
        # literals joined with no separator.
        # NOTE(review): the "[email protected]" placeholders look like redacted
        # e-mail addresses; restore the real ones if they are known.
        apply_proces_en = (
            '<p>When you are ready to make your application, there are six ways of applying. Choose the option which is the most convenient for you: Option 1: The Universitys overseas offices If you would like to apply to a course, you can contact one of our overseas offices to start the application process.Our overseas offices are able to answer any questions you may have about studying in the United Kingdom. Contact one of our offices in China, Malaysia, India, Vietnam or Greece to begin your application.Option 2: UCAS – undergraduate onlyInternational and UK students apply for undergraduate courses in the same way – through the Universities and Colleges Admissions Service (UCAS) website.The UCAS institution code for the University is S84.Option 3: Apply online – postgraduate only To study at postgraduate level, you need to apply to the University of Sunderland directly.Find the postgraduate course you’re interested in, and on the course page you will see a link to either apply online or download an application form (.pdf).Option 4: Email your application directly To apply for undergraduate and postgraduate courses at the main University of Sunderland campuses, email your completed application form (.pdf) to [email protected]. To apply for undergraduate and postgraduate courses at the University of Sunderland in London, email your completed application form (.pdf) to [email protected]. Option 5: In-country representatives If you live outside the UK, make your application by finding the most convenient contact from our in-country representatives.Once you have completed and submitted your application, you will be given a unique Personal ID number so you can be kept up-to-date with any developments in your application process. Option 6: Apply through a study centre To study with us in your country, through one of the University of Sunderlands study centres, you must apply directly through the relevant study centre. '
            'Visit the Other ways to study with us page to discover where you can study.</p>'
        )

        # 23.alevel
        alevel = ''.join(response.xpath(
            '//*[@id="fees-and-reqs"]/div[1]/p[3]').extract())
        alevel = remove_tags(alevel)

        item['alevel'] = alevel
        item['apply_proces_en'] = apply_proces_en
        # apply_pre was computed but never stored before.
        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['programme_en'] = programme_en
        item['duration'] = duration
        item['start_date'] = start_date
        item['tuition_fee'] = tuition_fee
        item['ucascode'] = ucascode
        item['tuition_fee_pre'] = tuition_fee_pre
        item['overview_en'] = overview_en
        item['modules_en'] = modules_en
        item['apply_desc_en'] = apply_desc_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['career_en'] = career_en
        yield item

示例#29

显示文件

    def parse(self, response):
        """Scrape a Queen Mary University of London course page and yield one item.

        Every field is read from ``response`` with XPath or set to a fixed
        value, stored on an item created by
        ``get_item1(ScrapyschoolEnglandItem1)``, and the item is yielded once.
        Extraction is best-effort: a failing section falls back to a
        placeholder (``'N/A'``, ``None`` or ``0``) instead of aborting.

        Args:
            response: the downloaded course page; only ``response.url`` and
                ``response.xpath(...).extract()`` are used.

        Yields:
            The populated item.
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def _from_marker(text, marker):
            """Drop everything before ``marker``; unchanged if it is absent."""
            index = text.find(marker)
            return text[index:] if index != -1 else text

        def _until_marker(text, marker):
            """Drop ``marker`` and everything after it; unchanged if absent."""
            index = text.find(marker)
            return text[:index] if index != -1 else text

        # 1.university
        university = 'Queen Mary University of London'

        # 2.location
        location = 'London'

        # 3.department
        department = ''.join(response.xpath(
            '//*[@id="count"]/article/div/aside/p[3]/a[1]').extract())
        department = remove_tags(department)

        # 4.programme_en
        programme_en = ''.join(response.xpath(
            '//*[@id="count"]/article/header/h1').extract())
        programme_en = remove_tags(remove_class(programme_en))

        # 5.degree_type
        degree_type = 2

        # 6.degree_name  7.duration  8.duration_per
        try:
            degree_name = ''.join(response.xpath(
                '//*[@id="count"]/article/header/h2').extract())
            degree_name = remove_tags(degree_name)
            # The duration is taken from the parenthesised part of the heading.
            duration = ''.join(re.findall(r'\(.*\)', degree_name))
            duration = duration.replace('(', '').replace(')', '')
            # 3 when the text mentions months, otherwise 1.
            duration_per = 3 if 'months' in duration else 1
            # Fix: r'\d' kept only the first digit, so "12 months" became '1'.
            duration = re.findall(r'\d+', duration)[0]
            # Strip the number and the parentheses, keep the first word.
            degree_name = degree_name.replace(duration, '')
            degree_name = degree_name.replace('(', '').replace(')', '')
            degree_name = degree_name.split()[0]
        except Exception:
            # Heading missing or without a parenthesised number.
            degree_name = 'N/A'
            duration = None
            duration_per = 1

        # 9.overview_en
        overview_en = ''.join(response.xpath('//*[@id="first"]').extract())
        overview_en = remove_class(overview_en)
        # Keep the text between the two headings. A missing heading used to
        # feed -1 into the slice and mangle the text; now that side is uncut.
        overview_en = _until_marker(_from_marker(overview_en, 'Overview'), 'Why study')
        overview_en = clear_space_str(overview_en)

        # 10.teach_time
        teach_time = 'full time'

        # 11.modules_en
        try:
            modules_en = ''.join(response.xpath('//*[@id="second"]').extract())
            modules_en = remove_class(modules_en)
            if 'For more information contact' in modules_en:
                modules_en = _until_marker(
                    _from_marker(modules_en, 'Structure'),
                    'For more information contact')
                modules_en = clear_space_str(modules_en)
        except Exception:
            modules_en = 'N/A'

        # 12.assessment_en
        try:
            assessment_en = ''.join(response.xpath('//*[@id="fourth"]').extract())
            assessment_en = clear_space_str(remove_class(assessment_en))
        except Exception:
            assessment_en = 'N/A'

        # 13.career_en
        try:
            career_en = ''.join(response.xpath('//*[@id="sixth"]').extract())
            career_en = clear_space_str(remove_class(career_en))
        except Exception:
            career_en = 'N/A'

        # 14.tuition_fee
        try:
            fee_pattern = r'\d{1,3},\d{3}'
            fee_text = ''.join(response.xpath('//*[@id="fifth"]/p[2]/text()').extract())
            fee_matches = re.findall(fee_pattern, remove_tags(fee_text))
            if not fee_matches:
                # Nothing in the second paragraph: try the first one.
                fee_text = ''.join(response.xpath('//*[@id="fifth"]/p[1]/text()').extract())
                fee_matches = re.findall(fee_pattern, remove_tags(fee_text))
            tuition_fee = fee_matches[0].replace(',', '')
        except Exception:
            # No "d,ddd"-style amount in either paragraph.
            tuition_fee = 0

        # 15.tuition_fee_pre
        tuition_fee_pre = '£'

        # 16.entry_requirements
        try:
            requirements_html = ''.join(response.xpath('//*[@id="third"]').extract())
            requirements_html = remove_class(requirements_html)
            if 'International applicants' in requirements_html:
                # Split at "International applicants": the part before it is
                # the entry requirements, the part after it goes to ``other``.
                entry_requirements = _from_marker(
                    _until_marker(requirements_html, 'International applicants'),
                    'Entry requirements')
                other = _until_marker(
                    _from_marker(requirements_html, 'International applicants'),
                    'For more information')
            else:
                entry_requirements = requirements_html
                other = 'N/A'
            if 'mso-fareast-language:EN-US' in entry_requirements:
                # Fix: this branch called the non-existent str.findall and
                # sliced with the string 'Normal', so it always raised and
                # both fields ended up as 'N/A'.
                # NOTE(review): presumably the intent was to cut at the first
                # occurrence of 'Normal' - confirm against a real page.
                entry_requirements = _until_marker(
                    _from_marker(entry_requirements, 'Entry requirements'),
                    'Normal')
            entry_requirements = clear_space_str(entry_requirements)
            other = clear_space_str(other)
        except Exception:
            entry_requirements = 'N/A'
            other = 'N/A'

        # 17.IELTS / TOEFL minimum scores, keyed by the scraped department.
        score_fields = ('ielts', 'ielts_l', 'ielts_s', 'ielts_r', 'ielts_w',
                        'toefl', 'toefl_l', 'toefl_s', 'toefl_r', 'toefl_w')
        scores_by_department = {
            'School of Business and Management':
                (7, 5.5, 5.5, 5.5, 6, 100, 17, 20, 18, 21),
            'School of English and Drama':
                (7, 7, 7, 7, 7, 100, 22, 25, 24, 27),
            'School of Geography':
                (7, 5.5, 5.5, 5.5, 6.5, 100, 17, 20, 18, 24),
            'School of History':
                (7, 5.5, 5.5, 5.5, 6.5, 100, 17, 20, 18, 24),
            'School of Languages, Linguistics and Film':
                (7, 5.5, 5.5, 5.5, 7, 100, 17, 20, 18, 27),
            'School of Law':
                (7, 5.5, 5.5, 5.5, 7, 100, 17, 20, 18, 24),
            'School of Politics and International Relations':
                (7, 5.5, 5.5, 5.5, 6.5, 100, 17, 20, 18, 24),
        }
        # Used for any department not listed above.
        default_scores = (6.5, 5.5, 5.5, 5.5, 6, 92, 17, 20, 18, 21)
        scores = scores_by_department.get(department, default_scores)

        url = response.url
        # Fixed text. The literal was broken across two physical lines, which
        # is not valid inside a single-quoted string; it is now two adjacent
        # literals joined with no separator.
        apply_documents_en = (
            'You must provide the following supporting documentation: Completed application form  Degree transcripts. Please provide a transcript of your degree(s). If you have not yet completed your degree please provide a transcript of your results achieved to date  If your degree was from a UK university, please upload a transcript of your marks for each year If your degree was from an overseas institution, you should supply a transcript of your marks for each year of your studies and a copy of your degree certificate together with a certified translation if the document is not in English. Please note that original documentation will be required before you enrol. International applicants are also advised to include high school transcripts Please provide the contact details of two referees on your application, at least one reference must be from an academic referee who is in a position to comment on the standard of your academic work and suitability for postgraduate level study. Where appropriate, a second referee can provide comment on your professional experience. Your academic referee(s) may already have provided you with a reference that you can use to support any application for study or research that you make. We call these ‘open’ references. Open references will normally only be accepted if they are written on headed paper, provided as a colour copy of the original, and provide the referee’s work contact details. If you have open references, please upload these at the time of application If you do not have open reference, we will contact your referee(s) via email to supply a reference, preferably electronically. Please note, we can only accept references provided by email if it is sent from a university or company email address. References from a personal email address such as Yahoo or Hotmail are not acceptable. Your referee(s) can also supply a paper reference in response to the reference request email your referee will receive. '
            'Paper reference forms should be endorsed by an appropriate institution/company stamp or on official institution/company letterhead, and should be provided as a scanned colour copy of the original. Curriculum Vitae (CV)/ Resume This list of documents may vary slightly from course to course, so you will need to check the guidance notes and academic school website for the programme that you are applying for.  Although not mandatory, you are encouraged to send in the following documents in support of your application:  Statement of purpose  Your statement of purpose should explain why you want to study your chosen programme and how it will help your life and career. This should typically be one side of A4 paper. IELTS/TOEFL certificate (if applicable) International applicants should provide evidence of English language ability: IELTS, TOEFL, or other acceptable proof. Please see the international students section for more details.'
        )
        require_chinese_en = 'Taught degrees (MSc/MA: one year) For entry onto our masters level courses students should normally have achieved: Four-year bachelors degree from 211 or 985 University with 75%+ average Four-year bachelors degree from non-211 University within top 300 with average 80%+ The usual entrance requirement to a taught masters degree is a four-year bachelors degree from a 211 University. However, all applications are considered on an individual basis and students may be admitted to masters programmes with a lower level degree if they have work experience relevant to the degree applied for. Students with a three-year diploma (dazhuan) from a recognised institution may apply for the Pre-Masters Graduate Diploma, a year-long course which will gain them access to a masters programme.Research degree (MPhil/PhD: three years) For entry onto our research degree courses students should normally have a masters degree from a recognised university.'
        apply_fee = 0
        apply_pre = '£'

        item['apply_fee'] = apply_fee
        item['apply_pre'] = apply_pre
        item['require_chinese_en'] = require_chinese_en
        item['apply_documents_en'] = apply_documents_en
        item['university'] = university
        item['location'] = location
        item['department'] = department
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['overview_en'] = overview_en
        item['teach_time'] = teach_time
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        # NOTE(review): 'rntry_requirements' looks misspelled but is kept
        # as-is; presumably it matches the item's declared field name.
        item['rntry_requirements'] = entry_requirements
        for field, value in zip(score_fields, scores):
            item[field] = value
        item['url'] = url
        yield item

示例#30

显示文件

文件： UniversityOfBristol_P.py 项目： histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Bristol"
        # items['country'] = "England"
        # items["website"] = "https://www.bristol.ac.uk/"
        item['url'] = response.url
        # 授课方式
        # item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # 专业
            course = response.xpath("//h1[@id='pagetitle']/span//text()").extract()
            # print("course = ", course)
            item['programme_en'] = ''.join(course).replace("\n", " ").replace("\r", " ").strip()
            print("item['programme_en']: ", item['programme_en'])

            # degreeaward
            degreeaward = response.xpath("//th[contains(text(),'Awards available')]/following-sibling::td[1]//text()").extract()
            # print("degreeaward = ", degreeaward)
            item['degree_name'] = clear_space_str(''.join(degreeaward))
            print("item['degree_name']: ", item['degree_name'])

            if "phd" in item['degree_name'].lower() or "md" in item['degree_name'].lower():
                item['teach_type'] = "phd"
                if "research" in item['degree_name'].lower():
                    item['teach_type'] += " " + "research"
                item['degree_type'] = 3
            elif "research" in item['degree_name'].lower():
                item['teach_type'] = "research"
                item['degree_type'] = 3
            else:
                item['teach_type'] = "taught"
                item['degree_type'] = 2
            # print("item['degree_type']: ", item['degree_type'])
            # print("item['teach_type']: ", item['teach_type'])

            # duration
            duration = response.xpath("//th[@scope='row'][contains(text(),'Programme length')]/following-sibling::td[1]//text()").extract()
            clear_space(duration)
            # print("duration: ", duration)
            item['teach_time'] = getTeachTime(''.join(duration))
            # print("item['teach_time']: ", item['teach_time'])

            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # location
            location = response.xpath("//th[@scope='row'][contains(text(),'Location of programme')]/following-sibling::td[1]//text()").extract()
            # print("location = ", location)
            item['location'] = clear_space_str(''.join(location))
            # print("item['location']: ", item['location'])

            # startdate
            startdate = response.xpath("//th[@scope='row'][contains(text(),'Start date')]/following-sibling::td[1]//text()").extract()
            clear_space(startdate)
            print("startdate = ", startdate)
            if len(startdate) > 0:
                # item['start_date'] = startdate[-1].strip()
                # print("item['start_date']: ", item['start_date'])
                item['start_date'] = getStartDate(''.join(startdate[-1]))
            print("item['start_date'] = ", item['start_date'])

            # deadline
            deadline = response.xpath("//div[@id='apply']/div[@class='apply-deadline']/p[1]//text()").extract()
            # print("deadline = ", deadline)
            item['deadline'] = getStartDate(''.join(deadline))
            # print("item['deadline']: ", item['deadline'])

            # department
            department = response.xpath("//div[@id='contact']/p[@class='pg-contact-address']/text()").extract()
            clear_space(department)
            # print("department1 = ", department)
            for d in department:
                if "School" in d or "Faculty" in d:
                    item['department'] = d
            # print("item['department']: ", item['department'])
            if item['department'] == "":
                allcontent = response.xpath("//main[@class='content']//text()").extract()
                clear_space(allcontent)
                department_re = re.findall(r"School\sof.{1,30}", ''.join(allcontent), re.I)
                # print("department_re: ", department_re)
                if len(department_re) > 0:
                    item['department'] = department_re[0].strip()
            # print("item['department']1: ", item['department'])

            # overview  //div[@id='programme-overview']//text()
            overview = response.xpath("//div[@id='programme-overview']|//div[@id='pgr-overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # tuitionFee   //div[@id='fees']
            tuitionFee = response.xpath("//dt[contains(text(),'Overseas: full-time')]/following-sibling::dd[1]//text()").extract()
            clear_space(tuitionFee)
            print("tuitionFee = ", tuitionFee)
            if len(tuitionFee) > 0:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = int(''.join(tuitionFee[0]).replace("£", "").replace(",", "").strip())

            if item['tuition_fee'] is None:
                tuitionFee1 = response.xpath(
                    "//dl//dt[contains(text(),'Overseas:')]/following-sibling::dd[1]//text()").extract()
                clear_space(tuitionFee1)
                print("tuitionFee1 = ", tuitionFee1)
                if len(tuitionFee1) > 0:
                    item['tuition_fee_pre'] = "£"
                    item['tuition_fee'] = getTuition_fee(''.join(tuitionFee1))
                if item['tuition_fee'] == 0:
                    item['tuition_fee_pre'] = ""
                    item['tuition_fee'] = None
            if item['tuition_fee'] is None:
                print("tuition_fee 为空")
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            # modules   //div[@id='programme-structure']
            modules = response.xpath("//div[@id='programme-structure']|//div[@id='pgr-research-groups']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            # 学术要求本科特殊专业要求、IELTS
            entryRequirements = response.xpath("//div[@id='entry-requirements']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entryRequirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = response.xpath("//*[contains(text(),'Profile')]//text()|//div[contains(text(),'IELTS')]//text()").extract()
            item['ielts_desc'] = clear_lianxu_space(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts_desc'] == "Profile A":
                item['ielts'] = 7.5
                item['ielts_l'] = 7.0
                item['ielts_s'] = 7.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 109
                item['toefl_l'] = 25
                item['toefl_r'] = 25
                item['toefl_s'] = 25
                item['toefl_w'] = 29
            elif item['ielts_desc'] == "Profile B":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 100
                item['toefl_l'] = 24
                item['toefl_r'] = 24
                item['toefl_s'] = 24
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile C":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 92
                item['toefl_l'] = 23
                item['toefl_r'] = 23
                item['toefl_s'] = 23
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile D":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 92
                item['toefl_l'] = 21
                item['toefl_r'] = 21
                item['toefl_s'] = 21
                item['toefl_w'] = 27
            elif item['ielts_desc'] == "Profile E":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 90
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 20
            elif item['ielts_desc'] == "Profile F":
                item['ielts'] = 6.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 86
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 23
            elif "Profile" not in item['ielts_desc']:
                ieltsDict = get_ielts(item['ielts_desc'])
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #       item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # Careers section: taken from //div[@id='careers']
            career = response.xpath("//div[@id='careers']").extract()
            # print("career = ", career)
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            require_chinese_en = """<h2 id="pgentryreqs">Entry requirements for postgraduate programmes</h2>
<p>You should&nbsp;<a href="/pg-howtoapply/">apply online</a>&nbsp;for all our postgraduate programmes.</p>
<p>To be considered for admission to postgraduate study at the University of Bristol, the minimum requirement for entry is an undergraduate (Bachelor&rsquo;s) degree that is equivalent to a UK Upper Second Class degree (also known as a 2:1). Please refer to the <a href="http://www.bristol.ac.uk/study/postgraduate/admissions-statements/%20%20%20" target="_blank">Postgraduate Admissions Statements</a> for each programme for individual entry requirements.</p>
<ul>
<li>Applicants who hold a 4-year Bachelor's (Honours) degree from a prestigious university with a minimum of 80% will be considered for admission to a Master's degree.</li>
<li>Applicants who hold a good Master's degree from a prestigious university will be considered for admission to PhD study.</li>
<li>Applicants will be required to meet the English language requirements for the programme. The profile level requirements can be found on the&nbsp;<a href="http://www.bristol.ac.uk/study/language-requirements/" target="_blank">English language requirements for study</a>&nbsp;page.</li>
</ul>"""
            item["require_chinese_en"] = remove_class(require_chinese_en)
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # http://www.bristol.ac.uk/study/postgraduate/apply/
            item['apply_proces_en'] = remove_class(clear_lianxu_space(["""<p>We offer an online application system for all of our programmes, except the Postgraduate Certificate in Education for which you should <a href="https://www.ucas.com/ucas/teacher-training/ucas-teacher-training-apply-and-track">apply through UCAS</a>.</p>
<p>You can use our online admissions system to:</p>
<ul>
<li>submit all your application details securely online and view your completed application form;</li>
<li>upload supporting documents;</li>
<li>request references electronically;</li>
<li>track the progress of your application;</li>
<li>receive a decision on your application online;</li>
<li>update your contact details (it is important you tell us if you change your home address or email);</li>
<li>receive useful information about the University and your application.</li>
</ul>
<p>If you are unable to make an online application, please contact the Enquiries team on <a href="mailto:[email protected]">[email protected]</a>.</p>"""]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            apply_documents_en = response.xpath("//h3[contains(text(),'English language requirements')]/preceding-sibling::*[position()<last()]").extract()
            item["apply_documents_en"] = remove_class(clear_lianxu_space(apply_documents_en))
            print("item['apply_documents_en']: ", item['apply_documents_en'])
            yield item
        except Exception as e:
            print("异常：", str(e))
            print("报错链接：", response.url)
            with open("scrapySchool_England/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a+', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")