Exemplos de remove_class em Python, exemplos de scrapySchool_England_Ben.remove_tags.remove_class em Python

Exemplo n.º 1

0

Exibir arquivo

    def parseAssessCareer(self, assessCareerUrl):
        """Fetch a course page and extract its assessment and career sections.

        The page at ``assessCareerUrl`` is downloaded with ``self.headers`` and
        parsed with lxml. Two groups of nodes are serialised back to HTML:

        * the "How is this course taught and assessed?" ``<h2>`` plus its
          first five following siblings, and
        * the "What are my career prospects?" ``<h2>`` plus its following
          ``<p>`` siblings.

        Returns:
            dict with keys ``'assessment_en'`` and ``'career_en'``, each
            holding the section HTML after ``clear_space_str`` and
            ``remove_class`` have been applied. ``'career_en'`` is the empty
            string when the career heading is not found.
        """
        data = requests.get(assessCareerUrl, headers=self.headers)
        response = etree.HTML(data.text)

        def _to_html(nodes):
            # Serialise every matched element and concatenate the results.
            return "".join(
                etree.tostring(node,
                               encoding='unicode',
                               pretty_print=False,
                               method='html') for node in nodes)

        assessment = response.xpath(
            "//h2[contains(text(),'How is this course taught and assessed?')]|//h2[contains(text(),'How is this course taught and assessed?')]/following-sibling::*[position()<6]"
        )
        assessmentStr = remove_class(clear_space_str(_to_html(assessment)))

        career = response.xpath(
            "//h2[contains(text(),'What are my career prospects?')]|//h2[contains(text(),'What are my career prospects?')]/following-sibling::p"
        )
        careerStr = ""
        if career:
            # Clean once, after all nodes are serialised, exactly like the
            # assessment section. Previously the accumulated string was
            # re-cleaned on every loop iteration.
            careerStr = remove_class(clear_space_str(_to_html(career)))

        return {'assessment_en': assessmentStr, 'career_en': careerStr}

Exemplo n.º 2

0

Exibir arquivo

 def get_modules2(self, modules2url):
     """Fetch ``modules2url`` and return its main container as cleaned HTML.

     The first ``<div class="container">`` directly under ``<body>`` is
     serialised to HTML and passed through ``clear_space_str`` and
     ``remove_class``.

     Returns:
         The cleaned HTML string. When the page has no matching container
         the cleaned empty string is returned instead of raising
         ``IndexError``, matching how the sibling ``parse_modules_en``
         handles a missing table.
     """
     data = requests.get(modules2url, headers=self.headers)
     response = etree.HTML(data.text)
     modules2 = response.xpath("/html/body/div[@class='container']")
     m2 = ""
     if modules2:
         m2 = etree.tostring(modules2[0],
                             encoding='unicode',
                             pretty_print=False,
                             method='html')
     return remove_class(clear_space_str(m2))

Exemplo n.º 3

0

Exibir arquivo

 def parse_modules_en(self, modulesUrl):
     """Fetch a modules page and return its table HTML plus the next link.

     Returns:
         A two-element list ``[html, next_hrefs]``. ``html`` is the text of
         the active ``<a>`` wrapped in ``<h2>`` followed by the first
         ``table-basic`` table, cleaned with ``clear_space_str`` and
         ``remove_class`` (the cleaned empty string when no table exists).
         ``next_hrefs`` is the list of ``href`` values of the ``<li>``
         immediately after the active entry.
     """
     page = requests.get(modulesUrl, headers=self.headers)
     tree = etree.HTML(page.text)

     heading_parts = tree.xpath("//a[@class='active']//text()")
     tables = tree.xpath("//table[@class='table-basic']")

     pieces = []
     if tables:
         # Heading text first, then the serialised table.
         pieces.append("<h2>")
         pieces.append(''.join(heading_parts))
         pieces.append("</h2>")
         pieces.append(
             etree.tostring(tables[0],
                            encoding='unicode',
                            pretty_print=False,
                            method='html'))
     cleaned = remove_class(clear_space_str("".join(pieces)))

     next_hrefs = tree.xpath(
         "//a[@class='active']/../following-sibling::li[1]/a/@href")
     return [cleaned, next_hrefs]

Exemplo n.º 4

0

Exibir arquivo

 def parse_apply_proces_en(self, how_to_apply_url):
     """Fetch the "how to apply" page and return its summary block as HTML.

     Returns:
         The first ``<div class="layout-row intro summary">`` serialised to
         HTML, or the URL itself when no such element exists; either value
         is passed through ``clear_space_str`` and ``remove_class`` first.
     """
     page = requests.get(how_to_apply_url, headers=self.headers)
     tree = etree.HTML(page.text)
     matches = tree.xpath("//div[@class='layout-row intro summary']")

     if len(matches) > 0:
         # Convert the matched Element to an HTML string.
         raw = etree.tostring(matches[0],
                              encoding='unicode',
                              pretty_print=False,
                              method='html')
     else:
         # Nothing matched: fall back to the page URL.
         raw = how_to_apply_url
     return remove_class(clear_space_str(raw))

Exemplo n.º 5

0

Exibir arquivo

Arquivo: UniversityOfChichester_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        """Build and yield the item(s) for one University of Chichester course page.

        An item is created with ``get_item(ScrapyschoolEnglandBenItem)`` and
        filled partly with fixed values (university, location, degree_type,
        IELTS scores, tuition fee, Chinese entry requirements, application
        process HTML) and partly with values read from ``response`` through
        XPath queries. The course duration is read from a second page that is
        fetched with ``requests`` when the unistats iframe is present.

        When the extracted ``ucascode`` contains "/", one item is yielded per
        "/"-separated code; otherwise a single item is yielded.

        Any exception raised while filling the item is appended to a text file
        under ``scrapySchool_England_Ben/error/`` together with the page URL,
        printed, and not re-raised.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Chichester"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        item[
            'location'] = 'University of Chichester, College Lane, Chichester, West Sussex, PO19 6PE'
        print("===========================")
        print(response.url)
        # The value is looked up in response.meta under the page URL as key.
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            department = response.xpath(
                "//div[@class='breadcrumb']//a[2]//text()").extract()
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            programmeDegreetype = response.xpath(
                "//div[@class='field-items accordion-content']/h2//text()"
            ).extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''
            if len(programmeDegreetype) > 0:
                programmeDegreetypeStr = programmeDegreetype[0].strip()
            # print("programmeDegreetypeStr: ", programmeDegreetypeStr)

            # The leading part of the heading is taken as the degree name;
            # what remains after removing it becomes the programme name below.
            degree_type = re.findall(
                r"^\w+/\w+|^\w+.*/\s\w+|^\w+\s\(Hons\)|^\w+/\w+\s\(Hons\)|^\w+",
                programmeDegreetypeStr, re.I)
            degree_name_str = ''.join(degree_type).strip()
            item['degree_name'] = degree_name_str.replace(
                "(Hons)", "").replace("(HONS)", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            programme = programmeDegreetypeStr.replace(degree_name_str, '')
            item['programme_en'] = ''.join(programme).replace(
                "(Hons)", "").title().strip().strip('-').strip()
            print("item['programme_en']: ", item['programme_en'])

            ucascode = response.xpath(
                "//p[contains(text(),'UCAS ')]//text()").extract()
            # Return value is ignored; presumably clear_space cleans the list
            # in place -- TODO confirm against the helper.
            clear_space(ucascode)
            # print("ucascode: ", ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).replace("UCAS",
                                                                "").strip()
            print("item['ucascode'] = ", item['ucascode'])

            alevel = response.xpath(
                "//*[contains(text(),'A levels')]//text()").extract()
            item['alevel'] = clear_lianxu_space(alevel)
            # print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//*[contains(text(),'International Baccalaureate')]//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib)
            # print("item['ib']: ", item['ib'])

            entry_requirements = response.xpath(
                "//section//div[@class='field field-name-field-main-content field-type-text-long field-label-hidden']//div[@class='field-items accordion-content']//h2[contains(text(), 'Entry')]/..//text()|"
                "//section//div[@class='field field-name-field-main-content field-type-text-long field-label-hidden']//div[@class='field-items accordion-content']//h2[contains(text(), 'ENTRY')]/..//text()"
            ).extract()
            # print("==", entry_requirements)
            # NOTE(review): the result is bound to a local that is never read
            # again in this method, so the entry requirements are not stored
            # on the item -- confirm whether an item field was intended.
            rntry_requirements = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//*[contains(text(),'IELTS')]/text()").extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            overview_en = response.xpath(
                "//span[contains(text(),'Course content')]/../..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            career_en = response.xpath(
                "//span[contains(text(),'Where this can take you')]/../.."
            ).extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            # print("item['career_en']: ", item['career_en'])

            modules = response.xpath(
                "//span[contains(text(),'Indicative modules')]/../..").extract(
                )
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//span[contains(text(),'Teaching and assessment')]/../.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            # interview_desc_en = response.xpath("//span[contains(text(),'Teaching and assessment')]/../..").extract()
            # item['interview_desc_en'] = remove_class(clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en']: ", item['interview_desc_en'])

            # The duration is not on the course page itself: it is read from
            # the page the unistats iframe points at, fetched synchronously.
            duration_url = response.xpath(
                "//iframe[@id='unistats-widget-frame']/@src").extract()
            clear_space(duration_url)
            print("duration_url: ", duration_url)
            if len(duration_url) > 0:
                data = etree.HTML(
                    requests.get(duration_url[0], headers=self.headers).text)
                duration = data.xpath(
                    "//p[contains(text(),'Full time')]//text()")
                clear_space(duration)
                print("duration: ", duration)
                # Only a two-element result is used: first element -> duration,
                # last element -> duration_per.
                duration_list = getIntDuration(''.join(duration))
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
            # NOTE(review): these reads happen even when the fields were not
            # assigned above; presumably get_item pre-populates them -- confirm.
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            # Fixed IELTS requirements applied to every course.
            item['ielts'] = 6.0
            item['ielts_l'] = 5.5
            item['ielts_s'] = 5.5
            item['ielts_r'] = 5.5
            item['ielts_w'] = 5.5

            # https://www.chi.ac.uk/study-us/fees-finance/tuition-fees
            item['tuition_fee'] = 13000
            item[
                'require_chinese_en'] = """<p>Senior Secondary School Certificate PLUS an International Foundation</p>
<p>Year OR Senior Secondary School Certificate 80% +</p>"""
            # https://www.chi.ac.uk/international/how-apply/undergraduate-applications
            # Static copy of the application page markup, cleaned with the
            # same helpers as the scraped sections.
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="col-2-3">
      
              <h1 class="title" id="page-title">Undergraduate Applications</h1>
      
      
        <div class="region region-content">
    <div id="block-system-main" class="block block-system">

    
  <div class="content">
    <div id="node-2130" class="node node-content-page node-readydeploy clearfix" about="/international/how-apply/undergraduate-applications" typeof="sioc:Item foaf:Document">
  
  <div class="content">
    <div class="field field-name-field-serif-intro field-type-text-long field-label-hidden"><div class="field-items"><div class="field-item even"><p>We recommend you apply online through  <a target="_blank" href="http://www.ucas.com/apply">UCAS</a>. International students may also apply directly to the University using the University of Chichester <a target="_blank" href="https://d3mcbia3evjswv.cloudfront.net/files/International%20Application%202017-2018_1_0.doc?dt7VFSaSnVZlYb1a1vvKmEfvuuqVmsqE">International Application Form.</a></p>
</div></div></div><div class="field field-name-body field-type-text-with-summary field-label-hidden"><div class="field-items"><div class="field-item even" property="content:encoded"><h3><span class="rangySelectionBoundary" id="selectionBoundary_1424440432166_7907926958422292" style="line-height: 0; display: none;"></span><strong>Applying via UCAS</strong></h3>
<p>You can apply for up to five different degree courses at up to five different institutions through UCAS (the national Universities and Colleges Admissions Service). Your application is sent to all five  universities which you have applied to at the same time.  There is no need to choose a first choice university at this stage.</p>
<p>Each course has a UCAS code that you will find on our <a href="/search/course-search">Course pages </a>or in our prospectus. You will need to know the UCAS code for the course you want to apply for when you make your application.</p>
<p><strong>Deadlines and important dates</strong></p>
<ul><li>1 September UCAS opens for applications for courses starting in September/October the following year.</li>
<li>15 January - Recommended application date for UK and other EU applicants.</li>
<li>30 June - Closing date for international (non-EU) applicants. (We do advise you to apply earlier if possible though.)</li>
<li>July / August - applications can still be submitted via UCAS but you can only apply to one university at a time in July and August (known as "Clearing")</li>
</ul><p>When you are applying to UCAS you will also need the UCAS institution code for the university. The UCAS code for the University of Chichester is <strong>(CHICH) C58</strong>.</p>
<p>Need further information or guidance on applying?</p>
<p>Then please either contact Admissions on +44 (0)1243 816002 or email <a href="mailto:[email protected]?subject=International%20application">[email protected]</a></p>
<h4><strong>Accepting an offer of a place</strong></h4>
<p>Your university offer(s) will be notified to you via your UCAS account and you can select a first ("firm") choice and, if you wish, a second ("insurance") choice via UCAS who will then inform the universities of your decision.</p>
<p><strong>Tuition fee deposit</strong></p>
<p>If you wish to accept an offer from the University of Chichester, you will be expected to pay a deposit of £2,000 before a UKVI Certificate of Acceptance for Studies (CAS) will be issued to you.</p>
<p>The deposit will be refunded, in full, if the University withdraws the programme.</p>
<p>Otherwise, the deposit will only be refunded, minus a £250 administration charge, if the applicant provides written evidence of being refused a visa to join the programme, through no fault of his or her own. Where the applicant has not disclosed relevant previous study, or does not have sufficient funds in the bank account for the relevant period, are examples of where it would be deemed the applicant's responsibility for not securing a visa.</p>
</div></div></div></div></div></div></div></div></div>"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            # A "/" in the UCAS code means several codes share one page:
            # yield once per code, pairing each with a degree-name part.
            if "/" in item['ucascode']:
                # Over-long values are blanked; "".split("/") is [""], so the
                # loop below still runs once with an empty code.
                if len(item['ucascode']) > 20:
                    item['ucascode'] = ""
                print("///////////////")
                print("item['ucascode']1: ", item['ucascode'])
                ucascode_0 = item['ucascode'].split("/")
                if "/" in item['degree_name']:
                    degree_name_0 = item['degree_name'].split("/")
                else:
                    # Single degree name: duplicated so indexes 0 and 1 exist.
                    degree_name_0 = [item['degree_name'], item['degree_name']]
                # NOTE(review): degree_name_0[u] raises IndexError when there
                # are more code parts than degree-name parts (caught by the
                # except below). The same item object is mutated and yielded
                # on every iteration -- confirm consumers do not keep a
                # reference to an earlier yield.
                for u in range(len(ucascode_0)):
                    item['ucascode'] = ucascode_0[u]
                    item['degree_name'] = degree_name_0[u]
                    yield item
            else:
                yield item
        except Exception as e:
            # Append the error and the page URL to a per-university log file;
            # the exception is not re-raised.
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 6

0

Exibir arquivo

    def parse_data(self, response):
        """Build and yield the item for one University of Edinburgh degree page.

        An item is created with ``get_item(ScrapyschoolEnglandBenItem)`` and
        filled with fixed values (university, degree_type, location, Chinese
        entry requirements HTML, application URL) and with values read from
        ``response`` through XPath queries. IELTS and TOEFL scores are derived
        from the scraped descriptions by ``get_ielts`` / ``get_toefl``; the
        tuition fee comes from ``self.parse_tuition_fee`` when the fees
        section links to a fee page.

        Exactly one item is yielded on success. Any exception raised while
        filling the item is appended to a text file under
        ``scrapySchool_England_Ben/error/`` together with the page URL,
        printed, and not re-raised.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Edinburgh"
        # item['country'] = 'England'
        # item['website'] = 'https://www.ed.ac.uk/'
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            # Programme (major)
            programme = response.xpath(
                "//h1[@itemprop='headline']//text()").extract()
            # Return value is ignored; presumably clear_space cleans the list
            # in place -- TODO confirm against the helper.
            clear_space(programme)
            programme_en = ''.join(programme).strip()

            # The pattern matches the headline up to and including its first
            # whitespace character, i.e. the first word is the degree name.
            degree_name = re.findall(r"^.*?\s", programme_en)
            if len(degree_name) > 0:
                item['degree_name'] = degree_name[0].strip()
            # NOTE(review): when the headline has no whitespace the field is
            # not assigned above; the reads below then rely on whatever
            # get_item put there -- confirm it is a string.
            print("item['degree_name']: ", item['degree_name'])

            item['programme_en'] = programme_en.replace(
                item['degree_name'], '').strip()
            print("item['programme_en']: ", item['programme_en'])

            department = response.xpath(
                "//div[@id='proxy_rightSummary']//p//span[contains(text(),'College:')]/../text()"
            ).extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            ucascode = response.xpath(
                "//span[contains(text(),'UCAS code:')]/../text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])

            duration = response.xpath(
                "//span[contains(text(),'Duration:')]/../text()").extract()
            clear_space(duration)
            # print("duration: ", duration)

            # Only a two-element result is used: first element -> duration,
            # last element -> duration_per.
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # //div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']
            # location = response.xpath(
            #     "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']/text()").extract()
            # clear_space(location)
            item[
                'location'] = '33 Buccleuch Place, City, Edinburgh, Post Code. EH8 9JS'
            # print("item['location']: ", item['location'])

            # # //option[@value='0010']
            # start_date = response.xpath(
            #     "//select[@name='code2']//option//text()").extract()
            # clear_space(start_date)
            # if len(start_date) > 1:
            #     item['start_date'] = start_date[0].strip()
            # # print("item['start_date']: ", item['start_date'])
            # item['start_date'] = getStartDate(item['start_date'])
            # print("item['start_date'] = ", item['start_date'])

            overview = response.xpath(
                "//div[@id='proxy_introduction']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='proxy_collapseprogramme']
            modules = response.xpath(
                "//div[@id='proxy_collapseWhatStudy']/..").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(
                list(modules)))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//div[@id='proxy_collapseLearning']/..").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            career = response.xpath(
                "//div[@id='proxy_collapseCareers']/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //div[@id='proxy_collapseentry_req']
            # entry_requirements = response.xpath(
            #     "//div[@id='proxy_collapseentry_req']/..//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath(
                "//li[contains(text(),'A Levels:')]//text()|//p[contains(text(),'A levels:')]//text()"
            ).extract()
            clear_space(alevel)
            # Only the last matched text node is kept.
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[-1]).strip()
            print("item['alevel'] = ", item['alevel'])

            # ib = response.xpath(
            #     "//html//ul[1]/li[3]/abbr[contains(text(),'IB')]/..//text()|//p[contains(text(),'IB:')]//text()").extract()
            ib = response.xpath(
                "//html//ul[3]/li[3]/abbr[contains(text(),'IB')]/..//text()|//p[contains(text(),'IB:')]//text()"
            ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib).strip()
            print("item['ib'] = ", item['ib'])

            IELTS = response.xpath(
                "//abbr[contains(text(),'IELTS')]/..//text()").extract()
            item['ielts_desc'] = ''.join(IELTS)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            # get_ielts returns a mapping; keys missing from it leave the
            # corresponding field as None because .get() is used.
            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            TOEFL = response.xpath(
                "//abbr[contains(text(),'TOEFL')]/..//text()").extract()
            # Fall back to any element mentioning TOEFL when no <abbr> matches.
            if len(TOEFL) == 0:
                TOEFL = response.xpath(
                    "//*[contains(text(),'TOEFL')]//text()").extract()
            item['toefl_desc'] = ''.join(TOEFL)
            # print("item['toefl_desc']: ", item['toefl_desc'])

            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # NOTE(review): tuition_feeDict is never read in this method.
            tuition_feeDict = {}
            tuition_fee_url = response.xpath(
                "//html//div[@id='proxy_collapseFees']//p[1]/a/@href").extract(
                )
            print("tuition_fee_url: ", tuition_fee_url)
            if len(tuition_fee_url) > 0:
                tuition_fee_url_str = tuition_fee_url[0]
                fee = self.parse_tuition_fee(tuition_fee_url_str)
                clear_space(fee)
                # Collect every "£<digits>,<digits>" amount from the fee text
                # and hand the concatenation to getTuition_fee.
                fee_re = re.findall(r"£\d+,\d+", ''.join(fee))
                print("fee_re: ", fee_re)
                item['tuition_fee'] = getTuition_fee(''.join(fee_re))
                item['tuition_fee_pre'] = "£"
            # A fee of 0 is stored as "no fee": None with an empty prefix.
            # NOTE(review): when no fee link was found this reads the value
            # left by get_item -- confirm its default.
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
                item['tuition_fee_pre'] = ""
            print("item['tuition_fee']: ", item['tuition_fee'])

            # https://www.ed.ac.uk/studying/international/country/asia/east-asia/china
            # Static copy of the China entry-requirements page markup, cleaned
            # with the same helpers as the scraped sections.
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p class="lead">Undergraduate entry requirements for students from China.</p>


  <h2>Senior High School Certificate</h2>

<p>Students who have completed the Chinese Senior High School Certificate are required to undertake further study for entry to most subjects as this qualification does not normally meet our minimum entry requirements.</p>

<p>We accept the following qualifications for direct entry to our undergraduate degree programmes:</p>

<ul>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/ruk/a-levels" title="A Levels"><abbr title="General Certificate of Education">GCE</abbr> <abbr title="Advanced Level">A Levels</abbr></a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/international/ib" >International Baccalaureate</a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/scottish-qualifications/highers" title="SQA Highers and Advanced Highers">Scottish qualifications</a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/international/country/americas/united-states-of-america" title="United States of America"><abbr title="United States">US</abbr> qualifications</a></li>
</ul>

<p>Applicants with qualifications other than those listed above will usually be required to complete a Foundation Year before entering the University.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/applying/foundation" title="International Foundation Programme">Foundation year</a></p>

<h2>Science and Engineering</h2>

<p>For degree programmes in Science and Engineering, applicants who have completed a year of study at a leading Chinese University may be eligible to apply.</p>

<p>The College of Science &amp; Engineering will also give consideration to applicants who have achieved excellent results in the Chinese National University Entrance Examination (Gaokao) on an individual basis.</p>

<h2>Further guidance on academic entry requirements</h2>

<p>Each course may have further specific entry requirements. All applicants must meet these requirements. Staff in the Admissions Offices will be able to provide further guidance.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/undergraduate/contacts" title="Contact us with an enquiry about undergraduate study">Undergraduate admissions contacts</a></p>

<h2>English Language requirements</h2>

<p>If your first language is not English, you will also have to meet English Language requirements to apply. These requirements are listed by programme.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/english" title="English language requirements">English Language advice</a></p>

<p><a class="uoe-node-link uoe-published" href="/studying/undergraduate/degrees" title="Degree finder">Specific English language requirement by programme</a></p>

<h2>Contact us</h2>

<p>Edinburgh Global's representative for China is Esther Sum.</p>

<p>Esther will help you with admissions advice and support.</p>

<p><a href="mailto:[email protected]">Contact us by email - [email protected]</a></p>

<h2>Support in your country</h2>

<p><a class="uoe-node-link uoe-published" href="/studying/international/application/our-visits-overseas" title="Our visits overseas">View a list of our overseas visits</a></p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/agents/list/china" title="China">Our agents in your country</a></p>

<h2>Chat to us</h2>

<p>Talk to a member of staff online and view a presentation about study in Edinburgh.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/application/chat-to-us-online" title="Online information sessions">Chat to us</a></p>

<h2>Join our mailing list</h2>

<p>We will send you further useful information about the University, admissions and entry.</p>

<p><a href="http://r1.dotmailer-surveys.com/0127judf-2e1gig1f">Join our mailing list </a></p>

<h2>About Edinburgh</h2>

<p><a class="uoe-node-link uoe-published" href="/about" title="About">More information about Edinburgh</a></p>

<p><a class="uoe-node-link uoe-published" href="/global/immigration/applying-for-visa/visa-requirements" >Do I need a visa?</a></p>

<h2>Student numbers</h2>

<p>There are almost 3,000 students students from China currently studying at the University of Edinburgh.</p>
"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # The application process is stored as a link, not as page markup.
            item[
                'apply_proces_en'] = "https://www.ed.ac.uk/studying/undergraduate/applying"

            yield item
        except Exception as e:
            # Append the error and the page URL to a per-university log file;
            # the exception is not re-raised.
            # NOTE(review): the separator has no trailing newline, so the next
            # appended entry starts on the same line as the "=" run.
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 7

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Newcastle University"
        # item['country'] = 'England'
        # item['website'] = 'http://www.ncl.ac.uk/'
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Newcastle University, NE1 7RU, United Kingdom"
        print("===========================")
        print(response.url)
        try:
            # 专业
            programmeDegree_type = response.xpath(
                "//header[@class='pageTitle']/h1//text()").extract()
            clear_space(programmeDegree_type)
            programmeDegree_type = ''.join(programmeDegree_type).strip()
            print("programmeDegree_type: ", programmeDegree_type)

            # degree_typeList = re.findall(r"\w+\sHonours$|\w+\sHonours.*", programmeDegree_type)
            degree_name = response.xpath("//strong[contains(text(),'Degree Awarded')]/../text()").extract()
            item['degree_name'] = ''.join(degree_name).strip()
            print("item['degree_name']: ", item['degree_name'])

            item['programme_en'] = programmeDegree_type.replace(item['degree_name'], "").strip()
            print("item['programme_en']: ", item['programme_en'])

            ucascode = response.xpath("//strong[contains(text(),'UCAS Code')]/../text()").extract()
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])

            durationMode = response.xpath(
                "//strong[contains(text(),'Course Duration')]/../text()").extract()
            if len(durationMode) == 0:
                durationMode = response.xpath(
                    "//p[@class='duration summary']/text()").extract()
            clear_space(durationMode)
            # print("durationMode: ", durationMode)
            duration_list = getIntDuration(''.join(durationMode))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            alevel_ib = response.xpath(
                "//strong[contains(text(),'Entry Requirements')]/../text()").extract()
            clear_space(alevel_ib)
            # print("alevel_ib: ", alevel_ib)
            if len(alevel_ib)==2:
                item['alevel'] = alevel_ib[0]
                item['ib'] = alevel_ib[-1]
            # if item['alevel'] == "":
            #     print("alevel 为空")
            # print("item['alevel']: ", item['alevel'])
            # if item['ib'] == "":
            #     print("ib 为空")
            # print("item['ib']: ", item['ib'])

            # //html//div[@class='contentSeparator textEditorArea expandable']//p[1]/a
            department = response.xpath(
                "//html//div[@class='contentSeparator textEditorArea expandable']//p[1]/a//text()").extract()
            if len(department) == 0:
                department = response.xpath("//*[contains(text(), 'School of')]/text()|//*[contains(text(), 'Faculty of')]/text()").extract()
            # print(department)
            department_str = ';'.join(department).strip()
            # print(department_str)
            dep = re.findall(r"School\sof[a-zA-Z\s,]+|Faculty\sof[a-zA-Z\s,]+", department_str)
            # print("dep: ", dep)
            if len(dep) > 0:
                for d in dep:
                    if "Faculty" in d:
                        item['department'] = d.replace("Graduate School", "").strip()
                        # print("长度1： ", len(item['department']))
                        if len(item['department']) > 55:
                            continue
                        else:
                            break
                    else:
                        item['department'] = dep[0]
                        # print("长度： ", len(item['department']))
                        if len(item['department']) > 55:
                            item['department'] = dep[-1]
            # print("item['department']: ", item['department'])


            # 页面全部内容
            allcontent = response.xpath(
                "//main[@id='content']//article//text()").extract()
            # clear_space(allcontent)
            # print("allcontent：", allcontent)

            department = re.findall(r"Newcastle\sUniversity\sBusiness\sSchool", ''.join(allcontent))
            # print("department: ", department)
            if len(department) > 0 and item['department'] == "":
                item['department'] = department[0]
            # print("==item['department']: ", item['department'])


            # //h3[contains(text(),'Highlights of this degree')]/../preceding-sibling::*[1]
            overview_en = response.xpath("//h3[contains(text(),'Highlights of this degree')]/../preceding-sibling::*[@class='contentSeparator containAsides textEditorArea'][1]|"
                                         "//h3[contains(text(),'Highlights of this degree')]/../following-sibling::*[@class='contentSeparator containAsides textEditorArea'][1]").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # if item['overview_en'] == "":
            #     print("overview_en 为空")
            # print("item['overview_en']: ", item['overview_en'])

            # //h3[contains(text(),'Teaching and assessment')]/../../preceding-sibling::*
            modules_en = response.xpath("//h3[contains(text(),'Teaching and assessment')]/../../preceding-sibling::*[position()<10]").extract()
            if len(modules_en) == 0:
                modules_en = response.xpath(
                    "//h2[contains(text(),'Course Details')]/../preceding-sibling::*[1]/following-sibling::*[position()<10]").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            rem_modules = re.findall(r"<div><h3>Related Degrees.*|<div>Next step:.*|<div><h3>Compare this course.*|<figure><iframe allowfullscreen></iframe></figure>", item['modules_en'])
            if len(rem_modules) > 0:
                for m in rem_modules:
                    item['modules_en'] = item['modules_en'].replace(m, '').strip()
            item['modules_en'] = item['modules_en'].replace("<div></div>", "").strip()
            item['modules_en'] = ''.join(item['modules_en'].split('\n')).strip()
            # if item['modules_en'] == "":
            #     print("modules_en 为空")
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Teaching and assessment')]/../..").extract()
            item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))
            # if item['assessment_en'] == "":
                # print("assessment_en 为空")
            # print("item['assessment_en']: ", item['assessment_en'])

            apply_desc_en = response.xpath(
                "//h2[contains(text(),'Entry Requirements')]/../preceding-sibling::*[1]/following-sibling::*[position()<20]").extract()
            item['apply_desc_en'] = ''.join(remove_class(clear_lianxu_space(apply_desc_en)).split('\n')).strip()
            # if item['apply_desc_en'] == "":
            #     print("apply_desc_en 为空")
            # print("item['apply_desc_en']: ", item['apply_desc_en'])

            portfolio_desc_en = response.xpath(
                "//h3[contains(text(),'Portfolio requirements')]/../..").extract()
            item['portfolio_desc_en'] = remove_class(clear_lianxu_space(portfolio_desc_en))
            # if item['portfolio_desc_en'] == "":
            #     print("portfolio_desc_en 为空")
            # print("item['portfolio_desc_en']: ", item['portfolio_desc_en'])


            # //h3[contains(text(),'English')]/../..//*[contains(text(),'IELT')]
            ielts_desc = response.xpath("//h3[contains(text(),'English')]/../..//*[contains(text(),'IELT')]/text()").extract()
            clear_space(ielts_desc)
            if len(ielts_desc) > 0:
                item['ielts_desc'] = ielts_desc[0].strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            # print(ielts_list)
            if len(ielts_list) == 1:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[0]
                item['ielts_s'] = ielts_list[0]
                item['ielts_r'] = ielts_list[0]
                item['ielts_w'] = ielts_list[0]
            elif len(ielts_list) == 2:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[1]
                item['ielts_s'] = ielts_list[1]
                item['ielts_r'] = ielts_list[1]
                item['ielts_w'] = ielts_list[1]
            elif len(ielts_list) == 3:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[2]
                item['ielts_s'] = ielts_list[2]
                item['ielts_r'] = ielts_list[2]
                item['ielts_w'] = ielts_list[1]
            elif len(ielts_list) > 3:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[1]
                item['ielts_s'] = ielts_list[1]
                item['ielts_r'] = ielts_list[1]
                item['ielts_w'] = ielts_list[1]
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))


            career_en = response.xpath("//h2[contains(text(),'Careers')]/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            if item['career_en'] == "<div><h2>Careers</h2><div></div></div>":
                item['career_en'] = remove_class(clear_lianxu_space(response.xpath("//h2[contains(text(),'Careers')]/../preceding-sibling::*[1]/following-sibling::*[position()<4]").extract()))
            # if item['career_en'] == "":
            #     print("career_en 为空")
            # print("item['career_en']: ", item['career_en'])


            tuition_fee = response.xpath("//h3[contains(text(),'Tuition Fees (International students)')]/../..//text()").extract()
            if len(tuition_fee) == 0:
                tuition_fee = response.xpath(
                    "//p[contains(text(),'September 2018 start (4 terms) ')]//text()").extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //h2[contains(text(),'Apply')]/../following-sibling::*
            apply_proces_en = response.xpath(
                "//h2[contains(text(),'Apply')]/../preceding-sibling::*[1]/following-sibling::*").extract()
            item['apply_proces_en'] = remove_class(clear_lianxu_space(apply_proces_en))
            if item['apply_proces_en'] == "":
                print("apply_proces_en 为空")
            print("item['apply_proces_en']: ", item['apply_proces_en'])


            # chinese_requirements
            item['require_chinese_en'] = remove_class(clear_lianxu_space(["""<div class="contentSeparator textEditorArea expandable collapsed"><a href="javascript:void(0)" class="toggle-wrapper" name="d.en.666526" data-widget-type="expandable"><h4 class="expandable-is-set">Undergraduate entry requirements</h4><span href="javascript:void(0)" class="toggle expandable-is-set">Undergraduate entry requirements</span></a>
  <div class="answer">
  
  <figure class="widget-aux right"></figure>
    
  <p>Typically we recognise a <strong>70% - 75% average</strong> in <strong>one year of Bachelor degree</strong> study at <strong>Project 211</strong> and <strong>Netbig Top 150</strong><a href="http://rank2011.netbig.com/en/%20"> </a><strong>institutions</strong>&nbsp;and a <strong>75% - 80% average</strong> at <strong>other institutions</strong>&nbsp;as comparable to ABB at A level.</p>
<p>You may be considered for first year entry with a&nbsp;<strong>pass</strong> from a <strong>university foundation programme in China </strong>at a <strong>recognised institution with</strong><em>&nbsp;</em>at least an<strong> 80% average and 80% in relevant subjects</strong>.</p>
<p>We know that different institutions use different grading scales. These may vary from our stated entry requirements. Please <strong>include the institution&rsquo;s marking scale</strong> if it is not stated on your transcript.</p>
<p>Please check course specific&nbsp;<strong>entry requirements</strong>&nbsp;for&nbsp;<a href="/undergraduate/degrees/">undergraduate degrees</a>&nbsp;as they do&nbsp;<strong>vary across courses</strong>, with ABB typically the minimum required.</p>
<p>On our course pages, you can also find out <a href="/undergraduate/apply/">how to apply</a>.</p>
  
  </div>
</div>"""]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])


            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: UniversityOfCambridge_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Cambridge"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item[
            'location'] = 'Fitzwilliam House, 32 Trumpington Street, Cambridge, CB2 1QY'
        print("===============================")
        print(response.url)
        try:
            # 专业、学位类型
            programme = response.xpath(
                "//h1[@class='campl-sub-title']//text()").extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            ucascode = response.xpath(
                "//div[contains(text(),'UCAS Code')]/following-sibling::div[1]//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            print("item['ucascode']: ", item['ucascode'])

            duration_degree_name = response.xpath(
                "//div[contains(text(),'Course Duration')]/following-sibling::div[1]//text()"
            ).extract()
            clear_space(duration_degree_name)
            print("duration_degree_name: ", duration_degree_name)

            if len(duration_degree_name) > 0:
                if "-" in ''.join(duration_degree_name).strip():
                    duration_degree_name_list = ''.join(
                        duration_degree_name).split("-")
                    item['degree_name'] = duration_degree_name_list[-1].strip()
                elif "–" in ''.join(duration_degree_name).strip():
                    duration_degree_name_list = ''.join(
                        duration_degree_name).split("–")
                    item['degree_name'] = duration_degree_name_list[-1].strip()

                duration_list = getIntDuration(''.join(duration_degree_name))
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
            print("item['degree_name'] = ", item['degree_name'])
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            # //div[contains(text(),'Colleges')]/following-sibling::div[1]
            department = response.xpath(
                "//div[contains(@class,'field-items')]/div[1]/div[1]/li[1]/span[1]//a//text()"
            ).extract()
            clear_space(department)
            if len(department) > 0:
                item['department'] = ''.join(department[0]).strip()
            if item['department'] == "Course website":
                item['department'] == ""
            if "Faculty of Modern and Medieval Languages" in item[
                    'department']:
                item['department'] = "Faculty of Modern and Medieval Languages"
            print("item['department'] = ", item['department'])

            # //html//div[@class='content']/div[1]/div  专业描述
            overview = response.xpath("//fieldset[@id='overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            modules_en = response.xpath(
                "//fieldset[@id='course-outline']|//table[contains(@class,'campl-table-bordered campl-table campl-vertical-stacking-table')]"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            # print("item['modules_en'] = ", item['modules_en'])

            teaching_assessment = response.xpath(
                "//a[@id='assessment']/../preceding-sibling::*[1]/following-sibling::*[position()<5]|"
                "//fieldset[@id='entry-requirements']//h2[contains(text(),'assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<8]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en'] = ", item['assessment_en'])

            apply_desc_en = response.xpath(
                "//fieldset[@id='entry-requirements']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(apply_desc_en))
            # print("item['apply_desc_en'] = ", item['apply_desc_en'])

            entry_requirements = response.xpath(
                "//h2[contains(text(),'Typical offers require')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(entry_requirements)
            # print("entry_requirements: ", entry_requirements)
            if "IB:" in entry_requirements:
                ibIndex = entry_requirements.index("IB:")
                item['alevel'] = ''.join(entry_requirements[:ibIndex]).strip()
                item['ib'] = ''.join(entry_requirements[ibIndex:]).strip()
            # print("item['alevel'] = ", item['alevel'])
            # print("item['ib'] = ", item['ib'])
            if item['alevel'] == "":
                al = response.xpath(
                    "//fieldset[@id='entry-requirements']//*[contains(text(),'A Level')]//text()"
                ).extract()
                clear_space(al)
                if len(al) > 0:
                    item['alevel'] = al[0].strip()
                # print("item['alevel']2 = ", item['alevel'])

            career_en = response.xpath(
                "//div[@class='fieldset-wrapper']//div[@class='field field-name-body field-type-text-with-summary field-label-hidden']//div[@class='field-item even']/h2[last()]//preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            print("item['career_en'] = ", item['career_en'])

            # https://www.undergraduate.study.cam.ac.uk/applying
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="field-item even"><p>If you want to apply to the University, you do so through UCAS. However, at Cambridge the process starts earlier to allow time for all of the application information to be gathered and considered. This section guides you through the process and explains what we’re looking for in prospective students and how we assess applications.</p>
<h1>1. Choose a course</h1>
<p>You’re going to be studying to a very high level for several years so make sure you <a href="https://www.undergraduate.study.cam.ac.uk/courses/">choose a course</a> you’re personally interested in and will really enjoy studying! Check, also, that you meet the <a href="https://www.undergraduate.study.cam.ac.uk/applying/entrance-requirements">entrance requirements</a> of the course you want to study.</p>
<h1>2. Choose a College</h1>
<p>Where would you like to live when you’re here? In your UCAS application, indicate if you have a preference <a href="https://www.undergraduate.study.cam.ac.uk/colleges">College</a> or if you’re <a href="https://www.undergraduate.study.cam.ac.uk/colleges/open-applications">making an open application</a>.</p>
<h1>3. Apply</h1>
<h3>UCAS application</h3>
<p>Submit your <a href="https://www.undergraduate.study.cam.ac.uk/applying/ucas-application">UCAS application</a> by <strong>15 October</strong> – our institution code is CAM C05.</p>
<p>Other application deadlines apply for those wishing to be interviewed in <a href="https://www.undergraduate.study.cam.ac.uk/international-students/overseas-interviews">overseas countries</a>, and for some <a href="https://www.undergraduate.study.cam.ac.uk/applying/mature-students-and-second-undergraduate-degrees/mature-student-applications">mature applicants</a>.</p>
<p>There's an additional application form if you're applying for the <a href="https://www.undergraduate.study.cam.ac.uk/courses/medicine-graduate-course">Graduate Course in Medicine</a>.</p>
<h3>Supplementary Application Questionnaire (SAQ)</h3>
<p>Shortly after submitting the UCAS application, you'll be asked (via email) to complete the <a href="https://www.undergraduate.study.cam.ac.uk/applying/saq">Supplementary Application Questionnaire (SAQ)</a> – a few extra questions requesting information not included in your UCAS application, which we find helpful. To make a valid application to the University of Cambridge, you must submit your SAQ by the deadline set. In the majority of cases this deadline will be 6.00pm (UK time) on 22 October 2018.</p>
<h3>Cambridge Online Preliminary Application (COPA)</h3>
<p>If you're living or attending school/college outside the EU and/or applying for an <a href="https://www.undergraduate.study.cam.ac.uk/finance/music-awards/organ-scholarships">Organ Scholarship</a>, you need to submit the <a href="https://www.undergraduate.study.cam.ac.uk/applying/copa">Cambridge Online Preliminary Application (COPA)</a>, and the deadline for submitting this may be earlier than 15 October (see the relevant page for information).</p>
<h3>Transcripts</h3>
<p>You may be required to submit an <a href="https://www.undergraduate.study.cam.ac.uk/applying/transcripts">academic transcript</a>.</p>
<h1>4. Written assessment</h1>
<p>Most applicants are required to take a <a href="https://www.undergraduate.study.cam.ac.uk/applying/admission-assessments">written admission assessment</a>, either pre-interview or at interview (if interviewed).</p>
<h1>5. Interview</h1>
<p>Everyone with a realistic chance of being offered a place is invited to attend an <a href="https://www.undergraduate.study.cam.ac.uk/applying/interviews">interview</a>. That’s around 75 per cent of applicants each year.</p>
<h1>6. Decision</h1>
<p>We’ll advise you of our <a href="https://www.undergraduate.study.cam.ac.uk/applying/decisions">decision</a> before the end of January.</p>
</div>"""
                ]))
            item['deadline'] = '2018-10-15'
            # item["application_open_date"] = '2018-11-30'

            tuition_fee_dict = {
                "Anglo-Saxon, Norse, and Celtic": 20157,
                "Archaeology": 20157,
                "Asian and Middle Eastern Studies": 20157,
                "Classics": 20157,
                "Economics": 20157,
                "Education": 20157,
                "English": 20157,
                "History": 20157,
                "History of Art": 20157,
                "History and Modern Languages": 20157,
                "History and Politics": 20157,
                "Human, Social, and Political Sciences": 20157,
                "Land Economy": 20157,
                "Law": 20157,
                "Linguistics": 20157,
                "Modern and Medieval Languages": 20157,
                "Philosophy": 20157,
                "Theology, Religion, and Philosophy of Religion": 20157,
                "Mathematics": 22482,
                "Architecture": 26376,
                "Geography": 26376,
                "Music": 26376,
                "Chemical Engineering": 30678,
                "Computer Science": 30678,
                "Engineering": 30678,
                "Management Studies (Part II course)": 30678,
                "Manufacturing Engineering (Part II course)": 30678,
                "Natural Sciences": 30678,
                "Psychological and Behavioural Sciences": 30678,
                "Veterinary Medicine": 52638,
                "Medicine (Graduate Course)": 70131,
                "Medicine": 70131,
            }
            item['tuition_fee'] = tuition_fee_dict.get(item['programme_en'])
            if item['tuition_fee'] is not None:
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            # https://www.undergraduate.study.cam.ac.uk/international-students/english-language-requirements
            item['ielts'] = '7.5'
            item['ielts_l'] = '7.0'
            item['ielts_s'] = '7.0'
            item['ielts_r'] = '7.0'
            item['ielts_w'] = '7.0'
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            item['toefl'] = '110'
            item['toefl_l'] = '25'
            item['toefl_s'] = '25'
            item['toefl_r'] = '25'
            item['toefl_w'] = '25'
            # print("item['toefl'] = %sitem['toefl_l'] = %sitem['toefl_s'] = %sitem['toefl_r'] = %sitem['toefl_w'] = %s==" % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Entry requirements</h2>
<p>The Gaokao is regarded as suitable preparation for Cambridge. The Gaokao scores of successful applicants will vary from province to province and year to year. As a guideline, successful applicants will usually have scores in the top 0.1% of those taking the Gaokao in their province. In addition to the total score, Cambridge Colleges will pay close attention to individual subject scores and scores in the Senior High School Examinations (Xueye Shuiping Kaoshi; previously the Huikao). The Xueye Shuiping Kaoshi alone are not regarded as suitable preparation for Cambridge.</p>
<p>Applicants studying for the Gaokao are encouraged to undertake additional study outside of their school qualifications. This might include, for example, relevant science Olympiads or College Board SAT I or II; or Advanced Placement Tests.</p>
<p>Gaokao offers are made on an individual basis, and we recommend that you <a href="https://www.undergraduate.study.cam.ac.uk/colleges/college-contacts">contact the College</a> to which you wish to apply for further advice and guidance.</p>"""
                ]))

            # item['apply_fee'] = apply_fee_re1[0].replace("£", "").strip()
            # item["apply_pre"] = "£"
            # print("item['apply_fee'] = ", item['apply_fee'])
            # print("item['apply_pre'] = ", item['apply_pre'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 9

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.bathspa.ac.uk/"
        item['university'] = "Bath Spa University"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            item['location'] = 'Bath'
            # 专业、学位类型//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1
            programme = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/p[1]//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            # //dt[contains(text(),'School')]/following-sibling::dd[1]
            department = response.xpath(
                "//dt[contains(text(),'School')]/following-sibling::dd[1]//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            location = response.xpath(
                "//dt[contains(text(),'Campus or location')]/following-sibling::dd[1]//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            modules = response.xpath(
                "//h3[contains(text(),'Course structure')]/..|//h3[contains(text(),'Course modules')]/..|//h2[contains(text(),'Course modules')]/.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            career = response.xpath(
                "//h3[contains(text(),'Career')]/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            feeContent = response.xpath(
                "//h3[contains(text(),'International students full time')]/../div/table[1]//td[contains(text(), 'Year')]/following-sibling::td//text()"
            ).extract()
            clear_space(feeContent)
            # print(feeContent)
            if len(feeContent) > 0:
                item['tuition_fee'] = int(feeContent[0].replace(
                    "£", "").replace(",", "").strip())
            # print("item['tuition_fee']: ", item['tuition_fee'])

            alevel = response.xpath(
                "//span[contains(text(),'A Level')]/..//text()|//li[contains(text(),'A Level')]//text()"
            ).extract()
            item['alevel'] = ''.join(alevel).strip()
            # print("item['alevel']: ", item['alevel'])
            # if item['alevel'] == "":
            #     print("****alevel")

            ib = response.xpath(
                "//span[contains(text(),'International Baccalaureate')]/..//text()|"
                "//li[contains(text(),'International Baccalaureate')]//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()
            # print("item['ib']: ", item['ib'])
            # if item['ib'] == "":
            #     print("****ib")

            # //div[@class='content']/div[@class='collapsible-content highlighted']/div[2]/div[2]

            ieltsList = response.xpath(
                "//*[contains(text(),'IELTS')]//text()").extract()
            item['ielts_desc'] = ''.join(ieltsList).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            interview_desc_en = response.xpath(
                "//h3[contains(text(),'Interview and portfolio guidance')]/..|"
                "//h3[contains(text(),'Portfolio and interview')]/..").extract(
                )
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en']: ", item['interview_desc_en'])
            # if item['interview_desc_en'] == "":
            #     print("****interview_desc_en")

            portfolio_desc_en = response.xpath(
                "//h3[contains(text(),'Interview and portfolio guidance')]/..|"
                "//h3[contains(text(),'Portfolio')]/..").extract()
            item['portfolio_desc_en'] = remove_class(
                clear_lianxu_space(portfolio_desc_en))
            # print("item['portfolio_desc_en']: ", item['portfolio_desc_en'])
            # if item['portfolio_desc_en'] == "":
            #     print("****portfolio_desc_en")

            # https://www.bathspa.ac.uk/international/country-advice/china/

            item[
                'require_chinese_en'] = "<p><strong>Undergraduate</strong></p><ul><li>Senior Secondary School Graduation Certificate with a grade of 70% and a Foundation Certification from a recognised institution.</li></ul><p><strong>Undergraduate - Year 2 or 3 entry</strong></p><ul><li>Students with a Dazhuan Certificate will be considered for Year 3 entry on an individual basis.&nbsp;</li></ul>"

            # https://www.bathspa.ac.uk/applicants/how-to-apply/postgraduate/
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="intro-text">
	<p class="intro">We’re delighted you’re applying to study with us. The process is different based on your location and mode of study. Here’s what you need to do.</p>
</div><div class="rich-text" >
  <div data-hash-anchor='<a id="d.en.1281"></a>'></div>
    <div>
        <h2>UCAS applicants</h2>
<p>If you fit the following criteria, you’ll need to apply through the Universities and Colleges Admissions Service (UCAS):</p>
<ul>
<li>You’re applying directly out of sixth form or college;</li>
<li>You want to study full-time;</li>
<li>You don’t already hold an undergraduate qualification and are from the UK, EU or Channel Islands.</li>
</ul>
<p><strong>The official UCAS deadline for 2018/19 applications to any course: 15 January 2018.</strong></p>
<p>You’ll need some information from your course's webpage, including Bath Spa University’s institution code: BASPA B20.</p>
<p>Read more about <a href="/applicants/how-to-apply/undergraduate-and-foundation/how-to-apply-through-ucas/">how to </a><a href="/applicants/how-to-apply/undergraduate-and-foundation/how-to-apply-through-ucas/">apply through UCAS</a> or just get started. You’ll need to register or login to the UCAS site. &nbsp;</p>
<p><a href="https://www.ucas.com/ucas/undergraduate/ucas-undergraduate-apply-and-track">Apply via UCAS</a></p>
<h2>International applicants</h2>
<p>You can apply for one of our undergraduate courses online from the course’s webpage.&nbsp;You’ll be asked to create an online account.</p>
<p>Don’t have time to complete your whole application? Don’t worry, you can save your application and come back to it at anytime.</p>
<p>Alternatively, you can also <a href="https://www.ucas.com/ucas/undergraduate/ucas-undergraduate-apply-and-track">apply via UCAS</a>.</p>
<p>Entry requirements are listed on the course pages. As part of the process you will be required to provide evidence to support your application.&nbsp;Please see our <a href="/international/">international</a> webpages for more information for international students, including entry requirements and visa advice specific to your country.</p>
<p><a href="/courses/">Search for your course</a></p>
<h2>Applying for part-time study</h2>
<p>If you’d like to study part-time, you’ll need to apply online directly with us, rather than through UCAS. &nbsp;</p>
<p><strong>Click the 'apply now' button on the webpage for the course you’d like to study.</strong></p>
<h2>Already hold an undergraduate degree?</h2>
<p>If you already have a degree or higher qualification than that for which you are applying, your fee requirements may be different, due to the way government University funding is distributed. Please check the Equivalent or Lower Qualification (ELQ) policy&nbsp;for more details.<br><br>This also applies to students who progress to the third year of study, following completion of a Foundation Degree. Please note that Foundation Degrees are currently exempt from higher fees.</p>
    </div>
</div>"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            ucascode = response.xpath(
                "//dd[contains(text(),'Course Code:')]//text()").extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            item['ucascode'] = ''.join(ucascode).replace("Course Code:",
                                                         "").strip()
            print("len: ", len(ucascode))
            print("item['ucascode'] = ", item['ucascode'])

            # duration
            durationMode = response.xpath(
                "//dt[contains(text(),'Course length')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(durationMode)
            print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            duration_list = getIntDuration(durationMode.strip())
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration']: ", item['duration'])
            print("item['duration_per']: ", item['duration_per'])
            item['other'] = durationMode
            print("item['other']: ", item['other'])

            if "or" in item['ucascode']:
                ucascode_list1 = item['ucascode'].split("or")
                print("ucascode_list1", ucascode_list1)

                # 拆分duration
                if ", or" in item['other']:
                    duration_list1 = item['other'].split(", or")
                else:
                    duration_list1 = [item['other'], item['other']]
                print("duration_list1: ", duration_list1)
                for u in range(len(ucascode_list1)):
                    item['ucascode'] = ucascode_list1[u].strip()
                    duration_list = getIntDuration(duration_list1[u].strip())
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    print("item['duration']: ", item['duration'])
                    print("item['duration_per']: ", item['duration_per'])
                    # 分为两种情况，第一种正常采集，第二种为带实习的专业
                    if u == 0:
                        overview = response.xpath(
                            """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                        ).extract()
                        if len(overview) == 0:
                            overview = response.xpath(
                                """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                            ).extract()
                        item['overview_en'] = remove_class(
                            clear_lianxu_space(overview))
                        # print("item['overview_en']1: ", item['overview_en'])

                        assessment_en = response.xpath(
                            """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|
                            //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|
                            //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Assessment')]/.."""
                        ).extract()
                        item['assessment_en'] = remove_class(
                            clear_lianxu_space(assessment_en))
                        # print("item['assessment_en']1: ", item['assessment_en'])
                    elif u == 1:
                        overview = response.xpath(
                            """//h2[contains(text(),"Professional placement year")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                        ).extract()
                        if len(overview) == 0:
                            overview = response.xpath(
                                """//h2[contains(text(),"Professional placement year")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                            ).extract()
                            if len(overview) == 0:
                                overview = response.xpath(
                                    """//h3[contains(text(),'Overview')]/..|//h3[contains(text(),'overview')]/.."""
                                ).extract()
                        item['overview_en'] = remove_class(
                            clear_lianxu_space(overview))
                        # print("item['overview_en']2: ", item['overview_en'])

                        assessment_en = response.xpath(
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|"
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|"
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'Assessment')]/.."
                        ).extract()
                        if len(assessment_en) == 0:
                            assessment_en = response.xpath(
                                "//h3[contains(text(),'How will I be assessed?')]/..|"
                                "//h3[contains(text(),'How will I be taught?')]/..|"
                                "//h3[contains(text(),'Assessment')]/.."
                            ).extract()
                        item['assessment_en'] = remove_class(
                            clear_lianxu_space(assessment_en))
                        # print("item['assessment_en']2: ", item['assessment_en'])
                    yield item
            else:
                overview = response.xpath(
                    """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                ).extract()
                if len(overview) == 0:
                    overview = response.xpath(
                        """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                    ).extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview))
                # print("item['overview_en']1: ", item['overview_en'])

                assessment_en = response.xpath(
                    """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|
                    //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|
                    //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Assessment')]/.."""
                ).extract()
                item['assessment_en'] = remove_class(
                    clear_lianxu_space(assessment_en))
                # print("item['assessment_en']1: ", item['assessment_en'])
                yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)


#            department_dict = {"arts management":"Bath Business School","accounting and finance":"Bath Business School",
# "business and management":"Bath Business School",
# "business and management (accounting)":"Bath Business School",
# "business and management (entrepreneurship)":"Bath Business School",
# "business and management (international business)":"Bath Business School",
# "business and management (marketing)":"Bath Business School",
# "curatorial practice":"Bath School of Art and Design",
# "design (ceramics)":"Bath School of Art and Design",
# "design (fashion and textiles)":"Bath School of Art and Design",
# "fine art":"Bath School of Art and Design",
# "visual communication":"Bath School of Art and Design",
# "children's publishing":"College of Liberal Arts",
# "classical acting":"College of Liberal Arts",
# "composition":"College of Liberal Arts",
# "creative producing":"College of Liberal Arts",
# "creative writing":"College of Liberal Arts",
# "creative writing phd":"College of Liberal Arts",
# "crime and gothic fictions":"College of Liberal Arts",
# "dance":"College of Liberal Arts",
# "directing":"College of Liberal Arts",
# "directing circus":"College of Liberal Arts",
# "environmental humanities":"College of Liberal Arts",
# "environmental management":"College of Liberal Arts",
# "feature filmmaking":"College of Liberal Arts",
# "heritage management":"College of Liberal Arts",
# "intercultural musicology":"College of Liberal Arts",
# "liberal arts":"College of Liberal Arts",
# "literature, landscape and environment":"College of Liberal Arts",
# "music performance":"College of Liberal Arts",
# "performing shakespeare":"College of Liberal Arts",
# "principles of applied neuropsychology":"College of Liberal Arts",
# "scriptwriting":"College of Liberal Arts",
# "songwriting (campus based)":"College of Liberal Arts",
# "songwriting (distance learning)":"College of Liberal Arts",
# "sound (arts)":"College of Liberal Arts",
# "sound (design)":"College of Liberal Arts",
# "sound (production)":"College of Liberal Arts",
# "theatre for young audiences":"College of Liberal Arts",
# "transnational writing":"College of Liberal Arts",
# "travel and nature writing":"College of Liberal Arts",
# "writing for young people":"College of Liberal Arts",
# "counselling and psychotherapy practice":"Institute for Education",
# "education (education studies)":"Institute for Education",
# "education (early childhood studies)":"Institute for Education",
# "education (international education)":"Institute for Education",
# "education (leadership and management)":"Institute for Education",
# "inclusive education":"Institute for Education",
# "professional practice":"Institute for Education",
# "professional practice in higher education":"Institute for Education",
# "teaching english to speakers of other languages":"Institute for Education",
# "specific learning difficulties / dyslexia":"Institute for Education",
# "national award for special educational needs coordination":"Institute for Education",
# "professional doctorate in education":"Institute for Education",
# }

Exemplo n.º 10

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.plymouth.ac.uk/"
        item['university'] = "University of Plymouth"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===============================")
        print(response.url)
        try:
            # //span[@class='course-title']
            programme = response.xpath(
                "//span[@class='course-title']//text()").extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            degree_type = response.xpath(
                "//h1[@class='hero-heading']/text()").extract()
            clear_space(degree_type)
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name'] = ", item['degree_name'])

            ucascode = response.xpath("//td[contains(text(),'UCAS course code')]/following-sibling::td//text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])

            department = response.xpath(
                "//h2[@class='school-title']//text()").extract()
            clear_space(department)
            item['department'] = ''.join(department)
            # print("item['department'] = ", item['department'])

            # 课程长度
            duration = response.xpath("//td[contains(text(),'Duration')]/following-sibling::td//text()").extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            # location
            location = response.xpath("//td[contains(text(),'Location')]/following-sibling::td//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()
            # print("item['location'] = ", item['location'])

            # overview
            overview1 = response.xpath("//div[@class='overview']|//div[@id='key-features-accordion']").extract()
            # overview2 = response.xpath("//div[@id='key-features-accordion']").extract()
            # overview = remove_class(clear_lianxu_space(overview1)) + remove_class(clear_lianxu_space(overview2))
            item['overview_en'] = remove_class(clear_lianxu_space(overview1))
            # if item['overview_en'] == '':
            #     print("***overview")
            # print("item['overview_en'] = ", item['overview_en'])

            # modules
            modules = response.xpath("//div[@id='structure-accordion']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            career_en = response.xpath("//div[contains(@id, 'career')]").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            print("item['career_en'] = ", item['career_en'])

            # entry_requirements
            entry_requirements = response.xpath("//div[@id='entry-requirements-accordion']//text()").extract()
            clear_space(entry_requirements)
            entry_requirements_str = ''.join(entry_requirements)
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            # .{1,150}IELTS.{1,150}
            IELTS = re.findall(r"IELT.{1,80}|ILETS.{1,80}", entry_requirements_str)
            print("IELTS: ", IELTS)
            if len(IELTS) != 0:
                ielts = ''.join(list(IELTS[0])).strip()
                item['ielts_desc'] = ielts
            print("item['ielts_desc'] = ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            if item['ielts'] != None:
                item['ielts'] = item['ielts'].strip(".").strip()
            if item['ielts_l'] != None:
                item['ielts_l'] = item['ielts_l'].strip(".").strip()
            if item['ielts_s'] != None:
                item['ielts_s'] = item['ielts_s'].strip(".").strip()
            if item['ielts_r'] != None:
                item['ielts_r'] = item['ielts_r'].strip(".").strip()
            if item['ielts_w'] != None:
                item['ielts_w'] = item['ielts_w'].strip(".").strip()
            print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
                    item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # alevel = response.xpath("//div[@id='entry-requirements-accordion']//*[contains(text(),'A Level')]/..//text()|"
            #                         "//div[@id='entry-requirements-accordion']//*[contains(text(),'A level')]//text()").extract()
            alevel = re.findall(r"A.*Level.{1,100}|A.*level.{1,100}", entry_requirements_str)
            clear_space(alevel)
            item['alevel'] = ''.join(alevel).strip()
            if item['alevel'] == '':
                print("***alevel")
            print("item['alevel'] = ", item['alevel'])
            if len(item['alevel']) > 300:
                item['alevel'] = ''.join(item['alevel'][:301])
            print("item['alevel']1 = ", item['alevel'])


            # ib = response.xpath("//div[@id='entry-requirements-accordion']//b[contains(text(),'International Baccalaureate')]/..//text()|"
            #                     "//div[@id='entry-requirements-accordion']//b[contains(text(),'IB')]/..//text()|"
            #                     "//div[@id='entry-requirements-accordion']//b[contains(text(),'International baccalaureate')]/..//text()").extract()
            # if len(ib) == 0:
            #     ib = response.xpath("//div[@id='entry-requirements-accordion']//b[contains(text(),'IB')]/../following-sibling::*[1]//text()").extract()
            #     if len(ib) == 0:
            ib = re.findall(r"IB.{1,100}|International.*Baccalaureate.{1,100}|International.*baccalaureate.{1,100}", entry_requirements_str)
            clear_space(ib)
            item['ib'] = ''.join(ib).strip()
            if item['ib'] == '':
                print("***ib")
            print("item['ib'] = ", item['ib'])

            # how_to_apply
            how_to_apply = response.xpath("//div[@id='how-to-apply-accordion']").extract()
            item['apply_proces_en'] = remove_class(clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # //html//div[@class='course-accordions']//tr[3]/td[3]
            # how_to_apply
            tuition_fee = response.xpath("//strong[contains(text(),'International')]/../following-sibling::*[2]//text()|"
                                         "//div[@id='fees-funding-accordion']//table[1]//td//text()").extract()
            clear_space(tuition_fee)
            print("tuition_fee: ", tuition_fee)
            tuition_fee_str = ''.join(tuition_fee)
            if len(tuition_fee)>0:
                item['tuition_fee'] = getTuition_fee(tuition_fee_str)
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']1 = ", item['tuition_fee'])
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
                item['tuition_fee_pre'] = ""
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            # https://www.plymouth.ac.uk/international/study/international-students-country-guides/asia/china
            item['require_chinese_en'] = remove_class(clear_lianxu_space(["""<p><b>Undergraduate</b></p><p>To apply for our undergraduate courses you'll need good grades in your&nbsp;
高中毕业证书
 Senior High School Graduation Examination and the&nbsp;
高考
 Chinese University Entrance Examination (Gaokao) for admission to year 1.&nbsp;</p><p>
Applicants who have completed the 专科毕业证书 &nbsp;Graduation Certificate - Specialist / Sub-degree (Zuanke) level (also known as the &nbsp;大专 Dazhuan) will be considered for final year (top-up) entry. We generally require an overall 70 per cent grade or above but this will vary depending on the institution.&nbsp;Contact us for more information: <a href="mailto:[email protected]">[email protected]</a>
</p><p>If you're a high school leaver, our partner college on campus, <a href="http://www.plymouth.ac.uk/international/plymouth-university-international-college">Plymouth University International College (PUIC)</a>, offers a wide variety of foundation courses. &nbsp;
</p><p><div class="table-responsive">
  <table class="table align-left">
      <tr>
            <td><b>A level</b> </td>
            <td> <b>UCAS tariff</b> </td>
            <td><b>Gaokao - percentage</b>&nbsp;</td>
            <td><b>Gaokao - overall grade</b>&nbsp;</td>
            <td><b>Gaokao - GPA</b>&nbsp;</td>
            <td> </td>
      </tr>
      <tr>
            <td> AAA </td>
            <td> 144 </td>
            <td> 90 – 100 </td>
            <td> A </td>
            <td> 4.0 </td>
            <td> </td>
      </tr>
      <tr>
            <td> BBB </td>
            <td> 120 </td>
            <td> 78 – 81 </td>
            <td> B </td>
            <td> 3.0 </td>
            <td> </td>
      </tr>
      <tr>
            <td> CCC </td>
            <td> 96 </td>
            <td> 70 – 71 </td>
            <td> C </td>
            <td> 2.0 </td>
            <td> </td>
      </tr>
</table>"""]))
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/"+item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 11

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bolton"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        print("subjectArea===: ", response.meta['subjectArea'])
        try:
            programmeDegreetype = response.xpath(
                "//div[@class='wpb_text_column wpb_content_element  vc_custom_1506499626241']/div[@class='wpb_wrapper']/h2//text()"
            ).extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype).strip()

            degree_type = response.xpath(
                "//li[@class='iconim award']//b[contains(text(),'Award:')]/..//text()"
            ).extract()
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type).replace(
                "Award:", "").replace("(Hons)", "").strip()
            # if item['degree_name'] == "":
            #     item['degree_name'] = "**"
            print("item['degree_name']: ", item['degree_name'])

            # if item['degree_name'].lower() == "phd":
            #     item['teach_type'] = 'phd'
            #     item['degree_type'] = 3
            # print("item['teach_type']: ", item['teach_type'])
            # print("item['degree_type']: ", item['degree_type'])

            programme = programmeDegreetypeStr.replace(
                item['degree_name'], '').replace("(Hons)",
                                                 "").replace("()", "").strip()
            item['programme_en'] = programme
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                "//li[@class='iconim date']//b[contains(text(),'Start date:')]/..//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            start_date_str = ''.join(start_date).replace("Start date:",
                                                         "").strip()
            # print("start_date_str: ", start_date_str)
            start_date_re = re.findall(r"\d+/\d+/\d+", start_date_str)
            # print("start_date_re: ", start_date_re)
            if len(start_date_re) > 0:
                for s in start_date_re:
                    start_date_sp = s.split('/')
                    item['start_date'] += start_date_sp[
                        -1] + "-" + start_date_sp[1] + "-" + start_date_sp[
                            0] + ", "
            if item['start_date'] != None:
                item['start_date'] = item['start_date'].strip().rstrip(
                    ',').strip()
            # print("item['start_date']: ", item['start_date'])

            location = response.xpath(
                "//li[@class='iconim location']//b[contains(text(),'Location:')]/..//text()"
            ).extract()
            item['location'] = ''.join(location).replace("Location:",
                                                         "").strip()
            # print("item['location']: ", item['location'])

            duration = response.xpath(
                "//li[@class='iconim duration']//b[contains(text(),'Duration:')]/..//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//div[@id='course-details']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='course-detail']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()"
            ).extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            # ielts_desc_re = re.findall(r'.{1,50}IELTS.{1,50}', ''.join(ielts_desc))
            # print("ielts_desc_re: ", ielts_desc_re)
            # if len(ielts_desc) > 0:
            item['ielts_desc'] = ''.join(ielts_desc).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            career_en = response.xpath(
                "//div[@id='careers-employment']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            # print("item['career_en']: ", item['career_en'])

            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            modules = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__modules']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__teaching-methods']"
                "|//div[@class='tab_content modules_tab_content tab__teaching-assessment__assessment-methods']"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath(
                "//h3[@class='table_header'][contains(text(),'International fees')]/following-sibling::div[1]/table//tr/th[contains(text(),'2018/')][1]/following-sibling::td[1]//text()"
            ).extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            department_dict = {
                "Art & Design and Fine Art":
                "Bolton School of the Arts",
                "Textiles & Fashion":
                "Bolton School of the Arts",
                "Media & Photography":
                "Bolton School of the Arts",
                "Theatre & Performance":
                "Bolton School of the Arts",
                "English & Creative Writing":
                "Bolton School of the Arts",
                "Graphic Design":
                "Bolton School of the Arts",
                "Animation & Illustration":
                "Bolton School of the Arts",
                "Accountancy":
                "Institute of Management Greater Manchester",
                "Business, Retail, Logistics & Supply Chain Management":
                "Institute of Management Greater Manchester",
                "Nursing":
                "Faculty of Health & Wellbeing",
                "Health & Social Care":
                "Faculty of Health & Wellbeing",
                "Dental Sciences":
                "Faculty of Health & Wellbeing",
                "Early Years & Childhood Studies":
                "Faculty of Health & Wellbeing",
                "Community Work & Youth":
                "Faculty of Health & Wellbeing",
                "School of Sport & Biological Sciences":
                "Faculty of Health & Wellbeing",
                "Automotive Design":
                "National Centre for Motorsport Engineering",
                "Chassis Dynamics & Aerodynamics":
                "National Centre for Motorsport Engineering",
                "General Engineering":
                "National Centre for Motorsport Engineering",
                "Motorsport & Trackside Technology":
                "National Centre for Motorsport Engineering",
                "Engines & Performance Modelling":
                "National Centre for Motorsport Engineering",
                "Our Partners":
                "National Centre for Motorsport Engineering",
                "Computing":
                "School of Creative Technologies",
                "Games":
                "School of Creative Technologies",
                "Special & Visual Effects":
                "School of Creative Technologies",
                "Education & Teacher Training":
                "School of Education & Psychology",
                "Psychology":
                "School of Education & Psychology",
                "Access courses":
                "School of Education & Psychology",
                "International Foundation programmes & English Pre-Sessional courses":
                "School of Education & Psychology",
                "Construction":
                "School of Engineering",
                "Civil Engineering":
                "School of Engineering",
                "Mechanical Engineering":
                "School of Engineering",
                "Motorsport & Automotive Performance Engineering":
                "School of Engineering",
                "Biomedical & Medical Engineering":
                "School of Engineering",
                "Electrical & Electronic Engineering":
                "School of Engineering",
                "Mathematics":
                "School of Engineering",
                "Law":
                "School of Law",
                "Centre for Contemporary Coronial Law":
                "School of Law",
                "Medical Biology":
                "School of Sport & Biological Sciences",
                "Sports & Sport Rehabilitation":
                "School of Sport & Biological Sciences",
            }
            item['department'] = department_dict.get(
                response.meta['subjectArea'])
            print("item['department']: ", item['department'])

            alevel = response.xpath(
                "//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/../span//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p><strong>Undergraduate entry to year 1 </strong></p>
<p>The above qualifications and completion of a suitable foundation programme.</p>
<p>Alternatively, successful completion of one year at a recognised Chinese university in a relevant subject.</p>
<p><strong>Undergraduate entry to year 2 / 3</strong></p>
<p>2 Year Diploma in a suitable subject area.</p>
<p>University College Graduation Diploma or Graduation Diploma from recognised institutions.</p>
<p>EDEXCEL or SQA HND</p>
<p>Da Zhuan (3 Year Diploma)</p>
<p>(Year 2 &amp; 3 entry is subject to successful programme mapping)</p>"""
                ]))

            ucascode = response.xpath(
                "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
            ).extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            item['ucascode'] = ''.join(ucascode).replace("UCAS code:",
                                                         "").strip()
            print("item['ucascode'] = ", item['ucascode'])

            mode = response.xpath(
                "//b[contains(text(),'Course type:')]/..//text()").extract()
            clear_space(mode)
            teach_time = ''.join(mode)
            print("teach_time: ", teach_time)

            isup = response.xpath(
                "//a[contains(text(),'Click here for more information on')]//text()"
            ).extract()
            # print("isup: ", isup)
            isup_str = ''.join(isup)
            if len(isup) == 0:
                isup = response.xpath(
                    "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
                    "|//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/..//text()"
                ).extract()
            print("isup_str: ", isup_str)
            print("isup: ", isup)
            if "full" in teach_time.lower():
                if "https://courses.bolton.ac.uk/course" in item['url']:
                    if "undergraduate" in isup_str or len(
                            item['ucascode']) != 0:
                        print("******存到数据库*****")
                        yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 12

0

Exibir arquivo

    def parse_data(self, response):
        """Scrape one Swansea University undergraduate course page into an item.

        Fills a fresh ``ScrapyschoolEnglandBenItem`` with the UCAS code, degree
        name, programme name, department, duration, overview, modules, entry
        requirements (A level, IB, IELTS), tuition fee, application process,
        assessment and career sections, then yields it.

        Args:
            response: the response for the course page; only ``response.url``
                and ``response.xpath(...)`` are used here.

        Yields:
            The populated item. If any step raises, nothing is yielded: the
            error text and the page URL are appended to a per-university file
            under ``scrapySchool_England_Ben/error/`` and printed instead.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Swansea University"
        item['url'] = response.url
        item['degree_type'] = 1
        item['location'] = "Singleton Park, Swansea, SA2 8PP, Wales, UK"
        print("===============================")
        print(response.url)
        try:
            # UCAS code.
            # NOTE(review): clear_space's return value is never used, so it
            # presumably cleans the list in place - confirm against the helper.
            ucascode = response.xpath(
                "//div[@class='top-button-ucas-code']/div[@class='top-button-value']//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            print("item['ucascode'] = ", item['ucascode'])

            # Programme name and degree name, both taken from the page heading.
            heading = ''.join(
                response.xpath(
                    "//h1[@class='content-header-heading']//text()").extract()
            ).strip()
            print(heading)
            if heading:
                # Every alternative is anchored at the start of the heading, so
                # at most one tuple comes back; joining its groups yields the
                # text of whichever alternative matched.
                award_match = re.findall(
                    r"^(\w+\s/\w+\s/\w+)|^(\w+/\w+/\w+)|(^\w+\s\(\w+\))|^(\w+/\s\w+)|^(\w+)",
                    heading)
                if award_match:
                    degree_name = ''.join(award_match[0])
                    item['degree_name'] = degree_name
                    # The programme name is the heading with the degree name
                    # removed.
                    item['programme_en'] = ''.join(
                        heading.split(degree_name)).strip()
            # The heading may also contain the UCAS code; strip it out.
            item['programme_en'] = item['programme_en'].replace(
                item['ucascode'], "").strip()
            print("item['degree_name'] = ", item['degree_name'])
            print("item['programme_en'] = ", item['programme_en'])

            # Department: fourth entry of the breadcrumb trail.
            department = response.xpath(
                "//div[@class='breadCrumb module']//ul/li[4]//text()"
            ).extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()

            # Course duration.
            duration = response.xpath(
                "//table[@class='top-button-course-variants-table']//tr/td//text()|"
                "//div[@class='top-button-duration']/div[@class='top-button-duration-value']/text()"
            ).extract()
            clear_space(duration)
            duration_list = getIntDuration(''.join(duration))
            # Only a two-element result is used: first element goes to
            # 'duration', last to 'duration_per'.
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            # Programme overview: three page sections joined by newlines.
            overview_parts = []
            for overview_xpath in (
                    "//div[@id='content-items']/div[@class='layout-article-items']/div[@class='title-and-body-text']",
                    "//div[@id='key-features']",
                    "//div[@id='description']",
            ):
                section = response.xpath(overview_xpath).extract()
                overview_parts.append(
                    remove_class(clear_lianxu_space(section)))
            item['overview_en'] = "\n".join(overview_parts)

            # Course modules.
            modules = response.xpath("//div[@id='modules']").extract()
            modules1 = response.xpath(
                "//div[@id='course-structure-']").extract()
            clear_space(modules)
            # Inline "(function () {..." script text found in the modules
            # section is removed from the stored HTML below.
            modulesEnd = re.findall(r"\(function\s\(\)\s{.*",
                                    '\n'.join(modules).strip())
            clear_space(modules1)
            modules = remove_class(clear_lianxu_space(modules)).replace(
                ''.join(modulesEnd), '').strip()
            item['modules_en'] = modules + remove_class(
                clear_lianxu_space(modules1))

            # Entry requirements text, used for the IB and IELTS lookups.
            entryRequirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            entryRequirements = clear_lianxu_space(entryRequirements)
            entry_text = ''.join(entryRequirements)

            alevel_re = response.xpath(
                "//span[@class='top-button-grades-required-value-postgraduate']//text()"
            ).extract()
            clear_space(alevel_re)
            item['alevel'] = ''.join(alevel_re).strip()

            # IB: first try elements whose own text mentions it.
            ib = response.xpath(
                "//div[@id='entry-requirements']//*[contains(text(),'IB')]/text()|"
                "//div[@id='entry-requirements']//*[contains(text(),'International Baccalaureate')]//text()"
            ).extract()
            if ib:
                item['ib'] = ib[0].strip()

            # Fallback: cut the IB paragraph out of the flattened text.
            # NOTE(review): relies on get_item() pre-filling 'ib' with "" -
            # confirm against the helper.
            if item['ib'] == "":
                ib_start = entry_text.find("International Baccalaureate")
                if ib_start != -1:
                    # str.find returns -1 when the marker is missing, so the
                    # end marker is searched only after the IB heading and a
                    # missing marker means "take the rest of the text". The
                    # previous unchecked slice cut the last two characters in
                    # that case and produced an empty value whenever the
                    # marker appeared before the IB heading.
                    ib_end = entry_text.find("BTEC", ib_start)
                    if ib_end == -1:
                        ib_end = entry_text.find("Welsh", ib_start)
                    if ib_end == -1:
                        ib_text = entry_text[ib_start:]
                    else:
                        # As before, stop one character before the marker.
                        ib_text = entry_text[ib_start:ib_end - 1]
                    item['ib'] = ib_text.strip()

            # IELTS: keep up to 50 characters of context on each side of every
            # "IELTS" mention, then read the scores out of that description.
            ielts = re.findall(r".{0,50}IELTS.{0,50}", entry_text)
            item['ielts_desc'] = ''.join(ielts)

            scores = re.findall(r"\d\.\d", item['ielts_desc'])
            if scores:
                # First score is the overall band; the second, when present,
                # is applied to all four components, otherwise the overall
                # band is reused.
                component = scores[1] if len(scores) >= 2 else scores[0]
                item['ielts'] = scores[0]
                item['ielts_l'] = component
                item['ielts_s'] = component
                item['ielts_r'] = component
                item['ielts_w'] = component

            # Tuition fee: fourth cell of the first row of the fees table.
            tuition_fee = response.xpath(
                "//div[@id='tuition-fees-contents']//table[@class='expander-item-fees-table']/tbody/tr[1]/td[4]//text()"
            ).extract()
            clear_space(tuition_fee)
            if tuition_fee:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
                # A fee of 0 is stored as None rather than as a real amount.
                if item['tuition_fee'] == 0:
                    item['tuition_fee'] = None

            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))

            assessment_en = response.xpath(
                "//a[contains(text(),'assessment')]/../..").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))

            # The careers section uses a different element id per subject, so
            # all known ids are tried together.
            career = response.xpath(
                "//div[@id='careers-and-employability']|//div[@id='careers-employability']|//div[@id='employabilitycareers']|"
                "//div[@id='employability-and-careers-']|//div[@id='careers-in-child-nursing-']|//div[@id='careers']"
                "|//div[@id='graduate-employability-and-careers']|//div[@id='careers-in-radiotherapy-physics']|//div[@id='careers-in-midwifery']|"
                "//div[@id='careers-in-neurophysiology-']|//div[@id='careers-in-psychology-']|//div[@id='careers-in-adult-nursing-']|"
                "//a[contains(text(),'Careers')]/../..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            # Fixed values that are the same for every course.
            item[
                'deadline'] = "http://www.swansea.ac.uk/undergraduate/apply/application-process/applying-for-2018/"
            item[
                'require_chinese_en'] = """<p><strong>Undergraduate Programmes:&nbsp;<br /></strong>Candidates are expected to have achieved a <span>Senior High School Graduation Diploma plus 1 year in a recognised Higher Education Institution (with 60% pass mark)</span>, including an&nbsp;IELTS 6.0 with 5.5 in each part of the test (or&nbsp;equivalent).</p>"""
            yield item
        except Exception as e:
            # Best-effort boundary for the whole page: record the failure and
            # the URL so the page can be re-crawled, without stopping the
            # spider.
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 13

0

Exibir arquivo

Arquivo: UniversityOfBradford_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bradford"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        # if "ug/" in response.url:
        print("===========================")
        print(response.url)
        # 重定向匹配不了
        item['major_type1'] = response.meta.get(response.url)
        # print("item['major_type1']: ", item['major_type1'])
        item['location'] = 'Bradford West Yorkshire BD7 1DP UK'
        # print("item['location']: ", item['location'])
        try:
            key_url = response.url.split("/")[-2].strip()

            programme = response.xpath(
                "//div[@id='course-key-info']//div[@class='col-xs-12']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath("//p[@id='cAward']//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            mode = response.xpath(
                "//option[@value='fulltime']//text()|//span[@id='cAttendance']//text()|//span[@id='displayYear']//text()"
            ).extract()
            clear_space(mode)
            # item['teach_time'] = getTeachTime(''.join(mode))
            print("mode: ", mode)

            if "full" in ''.join(
                    mode).lower() and "Foundation" not in item['programme_en']:
                overview_en = response.xpath(
                    "//div[@id='overviewStripe']").extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en))
                # print("item['overview_en']: ", item['overview_en'])

                modules = response.xpath(
                    "//div[@id='course-curriculum']").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))
                # print("item['modules_en']: ", item['modules_en'])

                assessment_en = response.xpath(
                    "//div[@class='row stripe background--green']").extract()
                item['assessment_en'] = remove_class(
                    clear_lianxu_space(assessment_en))
                # print("item['assessment_en']: ", item['assessment_en'])

                career_en = response.xpath(
                    "//div[@id='nav-course-career']").extract()
                item['career_en'] = remove_class(
                    clear_lianxu_space(career_en)).replace("<div></div>",
                                                           "").strip()
                # print("item['career_en']: ", item['career_en'])

                item[
                    'apply_proces_en'] = 'https://www.bradford.ac.uk/undergraduate/apply/'
                item['require_chinese_en'] = remove_class(
                    clear_lianxu_space([
                        """<div class="entryReq __undergraduate"><h3>Undergraduate</h3><table><tbody><tr><th rowspan="2">Qualification</th><th colspan="3">UCAS tariff points</th></tr><tr><th>136+</th><th>120 - 135</th><th>96 - 119</th></tr><tr><td>Senior Secondary School Graduation Certificate / 高中毕业证书</td><td colspan="3">Foundation Programme required</td></tr></tbody></table></div> """
                    ]))
                # print("item['require_chinese_en']: ", item['require_chinese_en'])

                department_dict = {
                    "Animation":
                    "Faculty of Engineering and Informatics",
                    "Biomedical Engineering":
                    "Faculty of Engineering and Informatics",
                    "Business Computing":
                    "Faculty of Engineering and Informatics",
                    "Chemical Engineering":
                    "Faculty of Engineering and Informatics",
                    "Civil and Structural Engineering":
                    "Faculty of Engineering and Informatics",
                    "Clinical Technology":
                    "Faculty of Engineering and Informatics",
                    "Computer Science":
                    "Faculty of Engineering and Informatics",
                    "Computer Science for Cyber Security":
                    "Faculty of Engineering and Informatics",
                    "Computer Science for Games":
                    "Faculty of Engineering and Informatics",
                    "Film and Television Production":
                    "Faculty of Engineering and Informatics",
                    "Film and Visual Effects Technology":
                    "Faculty of Engineering and Informatics",
                    "Game Design and Development":
                    "Faculty of Engineering and Informatics",
                    "Graphics for Games":
                    "Faculty of Engineering and Informatics",
                    "Mechanical Engineering":
                    "Faculty of Engineering and Informatics",
                    "Software Engineering":
                    "Faculty of Engineering and Informatics",
                    "Virtual and Augmented Reality":
                    "Faculty of Engineering and Informatics",
                    "MPhysiotherapy - Sport and Exercise Medicine MPhysio":
                    "Faculty of Health Studies",
                    "Nursing (Adult)":
                    "Faculty of Health Studies",
                    "Nursing (Adult) – Harrogate and District NHS Trust":
                    "Faculty of Health Studies",
                    "Nursing (Mental Health)":
                    "Faculty of Health Studies",
                    "Occupational Therapy":
                    "Faculty of Health Studies",
                    "Physiotherapy":
                    "Faculty of Health Studies",
                    "Public Health and Community Wellbeing":
                    "Faculty of Health Studies",
                    "Archaeology":
                    "Faculty of Life Sciences",
                    "Biomedical Science":
                    "Faculty of Life Sciences",
                    "Certificate of International Foundation Studies":
                    "Faculty of Life Sciences",
                    "Chemistry":
                    "Faculty of Life Sciences",
                    "Chemistry - Analytical Chemistry":
                    "Faculty of Life Sciences",
                    "Chemistry - Industrial Experience":
                    "Faculty of Life Sciences",
                    "Chemistry - Materials Chemistry":
                    "Faculty of Life Sciences",
                    "Chemistry - Mathematical and Computational Chemistry":
                    "Faculty of Life Sciences",
                    "Chemistry - Medicinal Chemistry":
                    "Faculty of Life Sciences",
                    "Clinical Sciences":
                    "Faculty of Life Sciences",
                    "Forensic and Medical Sciences":
                    "Faculty of Life Sciences",
                    "Forensic Archaeology and Anthropology":
                    "Faculty of Life Sciences",
                    "Forensic Science":
                    "Faculty of Life Sciences",
                    "Foundation in Clinical Sciences/Medicine":
                    "Faculty of Life Sciences",
                    "Optometry":
                    "Faculty of Life Sciences",
                    "Pharmacy":
                    "Faculty of Life Sciences",
                    "Pharmacy 5 years (including pre-registration training)":
                    "Faculty of Life Sciences",
                    "Accounting and Finance":
                    "Faculty of Management, Law & Social Sciences",
                    "Business and Management":
                    "Faculty of Management, Law & Social Sciences",
                    "Business Studies and Law":
                    "Faculty of Management, Law & Social Sciences",
                    "Economics":
                    "Faculty of Management, Law & Social Sciences",
                    "Finance and Economics":
                    "Faculty of Management, Law & Social Sciences",
                    "Human Resource Management":
                    "Faculty of Management, Law & Social Sciences",
                    "International Business and Management":
                    "Faculty of Management, Law & Social Sciences",
                    "Law":
                    "Faculty of Management, Law & Social Sciences",
                    "Law (Commercial Law)":
                    "Faculty of Management, Law & Social Sciences",
                    "Law (Criminal Law)":
                    "Faculty of Management, Law & Social Sciences",
                    "Law (Social Justice)":
                    "Faculty of Management, Law & Social Sciences",
                    "Law with Business and Management":
                    "Faculty of Management, Law & Social Sciences",
                    "Marketing":
                    "Faculty of Management, Law & Social Sciences",
                    "Criminology and Criminal Behaviour":
                    "Faculty of Management, Law & Social Sciences",
                    "Psychology":
                    "Faculty of Management, Law & Social Sciences",
                    "Psychology with Counselling":
                    "Faculty of Management, Law & Social Sciences",
                    "Social Work":
                    "Faculty of Management, Law & Social Sciences",
                    "Sociology":
                    "Faculty of Management, Law & Social Sciences",
                    "Working with Children, Young People and Families":
                    "Faculty of Management, Law & Social Sciences",
                }
                item['department'] = department_dict.get(
                    item['programme_en'].strip())
                print("item['department']: ", item['department'])

                # 将Full-time的情况获取duration、ucascode、alevel、ib、tuition_fee字段
                # # 1.duration
                # # https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/biomedical-engineering-beng/&duration=duration&level=ug&year=y2019&attendance=fulltime
                # duration_url = "https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/" + key_url + "/&duration=duration&level=ug&year=y2019&attendance=fulltime"
                # # print("duration_url: ", duration_url)
                # duration = json.loads(requests.get(duration_url).text).get("data")
                # # print("duration: ", duration)
                # if duration != None:
                #     duration_list = getIntDuration(''.join(duration))
                #     if len(duration_list) == 2:
                #         item['duration'] = duration_list[0]
                #         item['duration_per'] = duration_list[-1]
                # # print("item['duration'] = ", item['duration'])
                # # print("item['duration_per'] = ", item['duration_per'])
                #
                # # 2.ucascode
                # # https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/biomedical-engineering-beng/&ucasCode=ucasCode&level=ug&year=y2019&attendance=fulltime
                # ucascode_url = "https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/" + key_url + "/&ucasCode=ucasCode&level=ug&year=y2019&attendance=fulltime"
                # # print("ucascode_url: ", ucascode_url)
                # ucascode = json.loads(requests.get(ucascode_url).text).get("data")
                # # print("ucascode: ", ucascode)
                # if ucascode is not None:
                #     item['ucascode'] = ''.join(ucascode).strip()
                # # print("item['ucascode']: ", item['ucascode'])
                #
                #
                # # 3.alevel、ib
                # # https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/biomedical-engineering-beng/&entry=entry&level=ug&year=y2019&attendance=filltime
                # entry_url = "https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/" + key_url + "/&entry=entry&level=ug&year=y2019&attendance=fulltime"
                # # print("entry_url: ", entry_url)
                # entry = json.loads(requests.get(entry_url).text).get("data")
                # # print("entry: ", entry)
                # if entry is not None:
                #     entry_response = etree.HTML(entry)
                #     alevel = entry_response.xpath("//strong[contains(text(),'A levels')]/../../following-sibling::div//text()")
                #     # print("alevel: ", alevel)
                #     item['alevel'] = clear_lianxu_space(alevel)
                #
                #     ib = entry_response.xpath(
                #         "//strong[contains(text(),'International Baccalaureate requirements')]/../../following-sibling::div//text()")
                #     # print("ib: ", ib)
                #     item['ib'] = clear_lianxu_space(ib)
                #
                #     ielts_desc = entry_response.xpath(
                #         "//strong[contains(text(),'English language requirements')]/../../following-sibling::div/p[1]//text()")
                #     item['ielts_desc'] = clear_lianxu_space(ielts_desc)
                #     print("item['ielts_desc']: ", item['ielts_desc'])
                #
                #     ielts_dict = get_ielts(item['ielts_desc'])
                #     item['ielts'] = ielts_dict.get('IELTS')
                #     item['ielts_l'] = ielts_dict.get('IELTS_L')
                #     item['ielts_s'] = ielts_dict.get('IELTS_S')
                #     item['ielts_r'] = ielts_dict.get('IELTS_R')
                #     item['ielts_w'] = ielts_dict.get('IELTS_W')
                #     print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
                #
                # # 4.tuition_fee
                # # https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/biomedical-engineering-beng/&fees=fees&level=ug&year=y2019&attendance=fulltime
                # tuition_fee_url = "https://www.bradford.ac.uk/courses/ug/api.php?uri=/courses/ug/" + key_url + "/&fees=fees&level=ug&year=y2019&attendance=fulltime"
                # # print("tuition_fee_url: ", tuition_fee_url)
                # tuition_fee = json.loads(requests.get(tuition_fee_url).text).get("data")
                # # print("tuition_fee: ", tuition_fee)
                # if tuition_fee is not None:
                #     tuition_fee_response = etree.HTML(tuition_fee)
                #     tuition_fee_str = tuition_fee_response.xpath("//div[@id='tuitionFees']//p[contains(text(),'International:')]//text()")
                #     # print("tuition_fee_str: ", tuition_fee_str)
                #     tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee_str))
                #
                #     if len(tuition_fee_re) > 0:
                #         item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                # # print("item['tuition_fee']: ", item['tuition_fee'])

                is_full_sand = response.xpath(
                    "//select[@id='variant-attendance-mode']/option//text()"
                ).extract()
                clear_space(is_full_sand)
                item['other'] = ','.join(is_full_sand).strip().strip(
                    ',').strip()
                print("is_full_sand: ", is_full_sand)

                if len(is_full_sand) > 0:
                    for f in is_full_sand:
                        if f == "Full-time":
                            mode = 'fulltime'
                            detail_dict = self.parse_detail(key_url, mode)
                            item['duration'] = detail_dict.get('duration')
                            item['duration_per'] = detail_dict.get(
                                'duration_per')
                            item['ucascode'] = detail_dict.get('ucascode')
                            item['alevel'] = detail_dict.get('alevel')
                            item['ib'] = detail_dict.get('ib')
                            item['ielts_desc'] = detail_dict.get('ielts_desc')
                            item['tuition_fee'] = detail_dict.get(
                                'tuition_fee')
                            ielts_dict = get_ielts(item['ielts_desc'])
                            item['ielts'] = ielts_dict.get('IELTS')
                            item['ielts_l'] = ielts_dict.get('IELTS_L')
                            item['ielts_s'] = ielts_dict.get('IELTS_S')
                            item['ielts_r'] = ielts_dict.get('IELTS_R')
                            item['ielts_w'] = ielts_dict.get('IELTS_W')
                            print(
                                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                                % (item['ielts'], item['ielts_l'],
                                   item['ielts_s'], item['ielts_r'],
                                   item['ielts_w']))
                        elif f == "Full-time with Sandwich Year":
                            mode = 'sandwich'
                            detail_dict = self.parse_detail(key_url, mode)
                            item['duration'] = detail_dict.get('duration')
                            item['duration_per'] = detail_dict.get(
                                'duration_per')
                            item['ucascode'] = detail_dict.get('ucascode')
                            item['alevel'] = detail_dict.get('alevel')
                            item['ib'] = detail_dict.get('ib')
                            item['ielts_desc'] = detail_dict.get('ielts_desc')
                            item['tuition_fee'] = detail_dict.get(
                                'tuition_fee')
                            ielts_dict = get_ielts(item['ielts_desc'])
                            item['ielts'] = ielts_dict.get('IELTS')
                            item['ielts_l'] = ielts_dict.get('IELTS_L')
                            item['ielts_s'] = ielts_dict.get('IELTS_S')
                            item['ielts_r'] = ielts_dict.get('IELTS_R')
                            item['ielts_w'] = ielts_dict.get('IELTS_W')
                            print(
                                "===item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                                % (item['ielts'], item['ielts_l'],
                                   item['ielts_s'], item['ielts_r'],
                                   item['ielts_w']))
                        yield item
                else:
                    yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 14

0

Exibir arquivo

Arquivo: KingCollegeLondon_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.kcl.ac.uk/"
        item['university'] = "King's College London"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Strand, London. WC2R 2LS, United Kingdom"
        print("===============================")
        print(response.url)
        try:
            # //div[@id='container']/div[@class='hero clearfix']/div[@class='wrapper']/div[@class='inner']/h1
            # 专业、学位类型
            programmeDegree = response.xpath(
                "//div[@id='container']/div[@class='hero clearfix']/div[@class='wrapper']/div[@class='inner']/h1//text()"
            ).extract()
            clear_space(programmeDegree)
            programmeDegreeStr = ''.join(programmeDegree)
            print(programmeDegreeStr)
            degree_type = re.findall(
                r"(\s\w+)$|(\s\w+\s\(.*\))$|(\s\w+/\w+)$|(\s\w+/\w+/\w+)$",
                programmeDegreeStr)
            if len(degree_type) > 0:
                degree_type = list(degree_type[0])
            while '' in degree_type:
                degree_type.remove('')
            print("degree_type = ", degree_type)
            item['degree_name'] = ''.join(degree_type).strip()
            programme = programmeDegreeStr.replace(item['degree_name'],
                                                   '').strip()
            item['programme_en'] = programme
            print("item['degree_name'] = ", item['degree_name'])
            print("item['programme_en'] = ", item['programme_en'])

            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS code')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            if "," in item['ucascode']:
                item['ucascode'] = item['ucascode'].split(',')[0].strip()
            print("item['ucascode']: ", item['ucascode'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-1 active-tab']/p[2]/span
            duration = response.xpath(
                "//strong[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            durationStr = ''.join(duration)
            print(durationStr)
            # duration_re = re.findall(r"([a-zA-Z0-9]+\s)(year|month|week){1}", durationStr, re.I)
            duration_re = re.findall(
                r"([a-zA-Z0-9\.]+\s)(year|month|week|yr|yft){1}|([0-9\.]+)(yr|yft|\-month){1}",
                durationStr, re.I)
            # print(duration_re)
            d_dict = {
                "One": "1",
                "Two": "2",
                "Three": "3",
                "Four": "4",
                "Five": "5",
                "Six": "6",
                "Seven": "7",
                "Eight": "8",
                "Nine": "9",
                "Ten": "10",
                "one": "1",
                "two": "2",
                "three": "3",
                "four": "4",
                "five": "5",
                "six": "6",
                "seven": "7",
                "eight": "8",
                "nine": "9",
                "ten": "10",
            }
            if len(duration_re) > 0:
                d_int = re.findall(r"\d+", ''.join(duration_re[0]))
                if len(d_int) > 0:
                    item['duration'] = int(''.join(d_int))
                else:
                    d = re.findall(
                        r"(One)|(Two)|(Three)|(Four)|(Five)|(Six)|(Seven)|(Eight)|(Nine)|(Ten)|(one)|(two)|(three)|(four)|(five)|(six)|(seven)|(eight)|(nine)|(ten)",
                        ', '.join(duration_re[0]))
                    # print("d = ", d)
                    if len(d) > 0:
                        item['duration'] = int(
                            d_dict.get(''.join(d[0]).strip()))
                if "y" in ''.join(duration_re[0]) or "Y" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 1
                elif "m" in ''.join(duration_re[0]) or "M" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 3
                elif "w" in ''.join(duration_re[0]) or "W" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 4
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-2']
            includeDepartment = response.xpath(
                "//div[@class='tab tab-2']//p[contains(text(), 'Faculty')]/span//text()"
            ).extract()
            if len(includeDepartment) == 0:
                includeDepartment = response.xpath(
                    "//p[contains(text(), 'Department')]/span//text()"
                ).extract()
            clear_space(includeDepartment)
            item['department'] = ''.join(includeDepartment).strip()
            print("item['department'] = ", item['department'])

            # //div[@id='coursepage-overview']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate']
            overview = response.xpath(
                "//div[@id='coursepage-overview']/div[@class='wrapper clearfix']/div[1]"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            print("item['overview_en'] = ", item['overview_en'])

            # //div[@id='coursepage-course-detail']/div[@class='wrapper clearfix']/div
            # modules = response.xpath("//h3[contains(text(),'Course format and assessment')]/preceding-sibling::*").extract()
            modules = response.xpath(
                "//div[@id='coursepage-course-detail']/div[@class='wrapper clearfix']/div[@class='inner right lop-to-measure']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Course format and assessment')]/preceding-sibling::*[1]/following-sibling::*|"
                "//h3[contains(text(),'Course Structure & Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]|"
                "//h3[contains(text(),'Teaching style')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]|"
                "//*[contains(text(),'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-3]|"
                "//*[contains(text(),'TEACHING')]/../preceding-sibling::*[1]/following-sibling::*[position()<last()-3]|"
                "//*[contains(text(),'Teaching')]/../preceding-sibling::*[1]/following-sibling::*[position()<last()-3]|//strong[contains(text(),'Teaching')]/../preceding-sibling::*[1]/following-sibling::*|"
                "//b[contains(text(),'Teaching')]/../preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            print("item['assessment_en'] = ", item['assessment_en'])

            alevel = response.xpath(
                "//div[@class='requirements EntryReqs_UKALevel clearfix']//b[contains(text(),'Required grades')]/../following-sibling::p[1]//text()"
            ).extract()
            if len(alevel) == 0:
                alevel = response.xpath(
                    "//strong[contains(text(),'A-Level')]/../following-sibling::td[1]//text()"
                ).extract()
                if len(alevel) == 0:
                    alevel = response.xpath(
                        "//div[@class='requirements EntryReqs_UKALevel clearfix']//div[@class='required-grades']//text()//text()"
                    ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel).strip()[:160]
            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                # "//div[@class='requirements EntryReqs_UKIB clearfix']//b[contains(text(),'Required grades')]/../following-sibling::p[1]//text()|"
                "//div[@class='requirements EntryReqs_UKIB clearfix']//div[@class='required-grades']//text()"
            ).extract()
            if len(ib) == 0:
                ib = response.xpath(
                    "//div[@class='requirements EntryReqs_UKIB clearfix']//b[contains(text(),'Required grades')]/../../text()"
                ).extract()
                if len(ib) == 0:
                    ib = response.xpath(
                        "//strong[contains(text(),'International Baccalaureate')]/../following-sibling::td[1]//text()"
                    ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib).strip()[:160]
            print("item['ib'] = ", item['ib'])

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']
            entry_requirements = response.xpath(
                "//div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[1]//text()"
            ).extract()
            # item['rntry_requirements'] =clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            item['require_chinese_en'] = '''<h4><b>Undergraduate entry</b></h4>
<p>The Senior High School Certificate and/or <i>Hui&nbsp;</i><i>Kao</i>&nbsp;are not considered suitable for direct entry to undergraduate study at King's. Applicants may wish to consider taking one of our International Foundation programmes (see below).</p>'''

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']
            IELTS = response.xpath(
                "//*[contains(text(),'English')]/../../following-sibling::td[1]//text()|"
                "//*[contains(text(),'English')]/../following-sibling::td[1]//text()|"
                "//strong[contains(text(),'TOEFL iBT')]/../../following-sibling::td[1]//text()|"
                "//*[contains(text(),'English')]//../following-sibling::td[1]//text()"
            ).extract()
            clear_space(IELTS)
            # print(IELTS)
            item['ielts_desc'] = ''.join(IELTS).strip()
            item['toefl_desc'] = item['ielts_desc']
            print("item['ielts_desc'] = ", item['ielts_desc'])

            if item['ielts_desc'] == "Band A":
                item["ielts"] = 7.5  # float
                item["ielts_l"] = 7.0  # float
                item["ielts_s"] = 7.0  # float
                item["ielts_r"] = 7.0  # float
                item["ielts_w"] = 7.0
                item["toefl"] = 109  # float
                item["toefl_l"] = 25  # float
                item["toefl_s"] = 25  # float
                item["toefl_r"] = 25  # float
                item["toefl_w"] = 27
            elif item['ielts_desc'] == "Band B" or item[
                    'department'] == "The Dickson Poon School of Law" or item[
                        'department'] == "Dental Institute" or "Medicine" in item[
                            'programme_en']:
                item["ielts"] = 7.0  # float
                item["ielts_l"] = 6.5  # float
                item["ielts_s"] = 6.5  # float
                item["ielts_r"] = 6.5  # float
                item["ielts_w"] = 6.5
                item["toefl"] = 100  # float
                item["toefl_l"] = 23  # float
                item["toefl_s"] = 23  # float
                item["toefl_r"] = 23  # float
                item["toefl_w"] = 25
            elif item['ielts_desc'] == "Band D" or "Biochemistry" in item[
                    'programme_en']:
                item["ielts"] = 6.5  # float
                item["ielts_l"] = 6.0  # float
                item["ielts_s"] = 6.0  # float
                item["ielts_r"] = 6.0  # float
                item["ielts_w"] = 6.0
                item["toefl"] = 92  # float
                item["toefl_l"] = 20  # float
                item["toefl_s"] = 20  # float
                item["toefl_r"] = 20  # float
                item["toefl_w"] = 23
            elif item['ielts_desc'] == "Band E":
                item["ielts"] = 6.0  # float
                item["ielts_l"] = 5.5  # float
                item["ielts_s"] = 5.5  # float
                item["ielts_r"] = 5.5  # float
                item["ielts_w"] = 5.5
                item["toefl"] = 80  # float
                item["toefl_l"] = 20  # float
                item["toefl_s"] = 20  # float
                item["toefl_r"] = 20  # float
                item["toefl_w"] = 20

            if item['ielts_desc'] == "":
                ielts_desc = response.xpath(
                    "//strong[contains(text(),'IELTS Academic')]/../../../following-sibling::td[1]//text()"
                ).extract()
                item['ielts_desc'] = ''.join(ielts_desc).strip()
            if item['toefl_desc'] == "":
                toefl_desc = response.xpath(
                    "//strong[contains(text(),'TOEFL iBT')]/../../following-sibling::td[1]//text()"
                ).extract()
                item['toefl_desc'] = ''.join(toefl_desc).strip()

            if item['ielts'] == None:
                ielts_dict = get_ielts(item['ielts_desc'])
                item["ielts"] = ielts_dict.get('IELTS')  # float
                item["ielts_l"] = ielts_dict.get('IELTS_L')  # float
                item["ielts_s"] = ielts_dict.get('IELTS_S')  # float
                item["ielts_r"] = ielts_dict.get('IELTS_R')  # float
                item["ielts_w"] = ielts_dict.get('IELTS_W')
            if item['toefl'] == None:
                toefl_dict = get_ielts(item['toefl_desc'])
                item["toefl"] = toefl_dict.get('TOEFL')  # float
                item["toefl_l"] = toefl_dict.get('TOEFL_L')  # float
                item["toefl_s"] = toefl_dict.get('TOEFL_S')  # float
                item["toefl_r"] = toefl_dict.get('TOEFL_R')  # float
                item["toefl_w"] = toefl_dict.get('TOEFL_W')
                # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']/div[@class='requirements uk clearfix']/div[@class='copy'][2]/p[1]
            application_fee = response.xpath(
                "//h3[contains(text(), 'Application procedure')]/following-sibling::div[1]//text()"
            ).extract()
            clear_space(application_fee)
            # print(''.join(application_fee))
            application_fee_re = re.findall(r"application\sfee.*£\d+",
                                            ''.join(application_fee))
            print("apply_fee: ", ''.join(application_fee_re))
            af = ''.join(application_fee_re).replace("application fee of",
                                                     "").replace("£",
                                                                 "").strip()
            if len(af) != 0:
                item['apply_fee'] = int(af)
                item['apply_pre'] = "£"
            print("item['apply_fee'] = ", item['apply_fee'])

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']/div[@class='requirements uk clearfix']/div[@class='copy'][2]/p[1]
            application_documents = response.xpath(
                "//h3[contains(text(), 'Personal statement and supporting information')]/following-sibling::div[1]"
            ).extract()
            item['apply_documents_en'] = remove_class(
                clear_lianxu_space(application_documents))
            print("item['apply_documents_en'] = ", item['apply_documents_en'])

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']/div[@class='requirements uk clearfix']/div[@class='copy'][2]/p[1]
            deadline = response.xpath(
                "//div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[1]/div[@class='requirements uk clearfix']/div[@class='copy'][4]//text()"
            ).extract()
            clear_space(deadline)
            print(deadline)
            deadline_str = ''.join(deadline).strip()
            item['deadline'] = getStartDate(deadline_str)
            print("item['deadline'] = ", item['deadline'])

            # //div[@id='coursepage-fees-and-funding']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off']/ul[1]/li[2]
            tuition_fee = response.xpath(
                "//p[contains(text(),'The International tuition fee for the 2018-2019 ac')]//text()"
            ).extract()
            print("tuition_fee = ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+|£\d+|\d+,\d+",
                                        ''.join(tuition_fee))
            # print(tuition_fee_re)
            if len(tuition_fee_re) >= 1:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = int(tuition_fee_re[0].replace(
                    "£", "").replace(",", "").strip())
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])
            print("item['tuition_fee'] = ", item['tuition_fee'])

            # //div[@id='coursepage-career-prospect']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate']
            career = response.xpath(
                "//div[@id='coursepage-career-prospect']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en'] = ", item['career_en'])

            # //b[contains(text(),'The interview')]/..|//b[contains(text(),'The interview')]/../following-sibling::*[position()<3]
            interview_desc_en = response.xpath(
                "//b[contains(text(),'The interview')]/..|"
                "//b[contains(text(),'The interview')]/../following-sibling::*[position()<3]|"
                "//b[contains(text(),'Interviewing')]/../following-sibling::*[1]"
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            print("item['interview_desc_en'] = ", item['interview_desc_en'])

            # //b[contains(text(),'Application deadline:')]/..
            deadline = response.xpath(
                "//b[contains(text(),'Application deadline:')]/../text()"
            ).extract()
            item['deadline'] = remove_class(clear_lianxu_space(deadline))
            print("item['deadline'] = ", item['deadline'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """ <h1>Applying to King&#39;s College London</h1>
<img alt="King's College London lecture at the Strand campus" height="330" width="780" src="/ImportedImages/0Prospectus/undergraduate/apply/generic-page-images/lecturer-2014-strand-6.29.jpg" />
<p><br />We're delighted that you're considering applying to King's. Once you've checked the information and&nbsp;<a class="sys_16" href="https://www.kcl.ac.uk/study/undergraduate/apply/entry-requirements/index.aspx">entry requirements</a>&nbsp;for your chosen course, you will need to follow the correct application procedure, depending on the type of study you're interested in:</p>
<div class="contentpage-accordion clearfix">
<h3 class="accordion-toggle">Undergraduate degree courses (UCAS)</h3>
<div class="accordion-content">
<p class="p1">For all full-time undergraduate higher education courses at universities and colleges in the UK you must make an online application via the Universities and Colleges Admissions Service - more commonly known as&nbsp; <a class="sys_16" href="http://www.ucas.com/">UCAS</a>.<br /><br /></p>
<h4 class="p1"><b>UCAS has three key functions:</b></h4>
<ol>
<li class="p4"><a class="sys_16" href="https://digital.ucas.com/search">Course search</a>:&nbsp;allows you to search for courses throughout the UK. Remember to always check King's&nbsp;online prospectus&nbsp;for the most detailed information on King's courses.</li>
<li class="p4"><a class="sys_16" href="https://www.ucas.com/ucas/undergraduate/register">Apply</a>:&nbsp;the UCAS online application system. You should use this to make your application(s) to King's. 'Apply' will allow you to apply to several different universities and/or courses at once.</li>
<li class="p4"><a class="sys_16" href="https://www.ucas.com/ucas/undergraduate/login">Track</a>:&nbsp;a central tracking system for following the progress of your different applications. King's will also provide you with an account for our own supplementary tracking and messaging system (called King's Apply).</li>
</ol>
<p class="p1">Please read the following guidelines before making your application.</p>
<h4 class="p1"><b><br />Who should use UCAS?</b></h4>
<ul>
<li class="p2">All applicants for full-time undergraduate courses at King's should apply through UCAS (with the exception of applicants from&nbsp; <a class="sys_16" href="http://www.kcl.ac.uk/usa">North America</a>&nbsp;who may use Common App if preferred).</li>
<li class="p2">All applicants for Nursing with registration (graduate entry) PG Dip</li>
<li class="p2">All applicants for Midwifery with registration (graduate entry) PG Dip</li>
</ul>
<p>You can apply through your school or college, or as an individual.</p>
<h4 class="p1"><b><br />When should I apply?</b></h4>
<p class="p2">You can apply to UCAS from 1 September for entry the following autumn, but remember you can start&nbsp; <a title="Undergraduate study" class="sys_0 sys_t0" href="/study/undergraduate/index.aspx">doing your research</a>, attending&nbsp;open days, and preparing your personal statement earlier than this.</p>
<p class="p2">The normal closing date for receipt of applications is&nbsp;15&nbsp;January.&nbsp;However if you are including Oxford or Cambridge, or to Medicine or Dentistry, then the closing date is&nbsp;15 October&nbsp;in the year prior to entry.&nbsp;</p>
<p class="p2">The UCAS website states a more flexible deadline for international students, however, any application received by King's after the above dates is considered late.</p>
<h4 class="p1"><b><br />How do I use UCAS?</b></h4>
<p class="p2">UCAS allows you to apply to a maximum of five courses per year, but only four of those may be Medicine/Dentistry courses.</p>
<p class="p2">You will need to create an account in UCAS 'Apply' and complete an application form. Your application will then be forwarded by UCAS to all of the universities you have applied to for us to consider.</p>
<p class="p2">If you have participated in a King's widening participation scheme such as K+, please ensure you note this in your application as advised by the&nbsp;<a title="Widening Participation" class="sys_0 sys_t0" href="/study/widening-participation/index.aspx">Widening Participation team</a>.</p>
<p class="p2">UCAS has detailed instructions on the&nbsp;<a class="sys_16" href="http://www.ucas.com/how-it-all-works/undergraduate">UCAS website</a>.</p>
<p class="p2">You can also read our&nbsp;<a title="Before you apply" class="sys_0 sys_t7240628" href="/study/undergraduate/apply/faqs/index.aspx">frequently asked questions about applying</a>.<a title="UCAS website" class="sys_16" href="https://www.ucas.com/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Undergraduate degree courses (Common App)</h3>
<div class="accordion-content">

<p>King's College Lon</p>
<h4>UCAS or Common App?</h4>
<p>We will only consider applicants through The Common Application who have not also applied through UCAS.&nbsp;</p>
<h4>Who should use Common App?</h4>
<ul>
<li>Common App is an option to be used by students who will be classified as paying international fees. Therefore Home/EU fee students must use UCAS. Those who are unsure should use UCAS.</li>
</ul>
<ol>
<li>Common App is available for all programmes excluding Physiotherapy, Nursing, Medicine, Dentistry, and Nutrition and Dietetics. Students must use UCAS to apply to these programmes.</li>
<li>Students must apply to no more than a combined total of five courses (UCAS and Common App) within the UK.</li>
</ol>
<p>All Common App applicants will be expected to complete the supplement element of the Common Application. We strongly encourage students to submit their application by January 15, but the College may consider later applications up to May 1. Once a Common Application is submitted to King's, students will be registered on the College's MyApplication system through which they will be able to track the progress of their application.</p>
<h4>King&rsquo;s College London Common Application timeline</h4>
<ol>
<li>Applicant applies to King&rsquo;s using the&nbsp;<a class="sys_16" href="http://www.commonapp.org/">Common Application form</a>.</li>
<li>The application is transferred onto the KCL Admissions Portal by King&rsquo;s admissions staff. This process may take approximately 30 days, depending upon when you submitted your application.</li>
<li>Once the application has been successfully inputted into the system, the applicant receives login details for the &lsquo;<a class="sys_16" href="https://myapplication.kcl.ac.uk/">MyApplication</a>&rsquo; admissions portal. This is used to track the progress of an application and communicate with admissions staff</li>
<li>Applicants will be contacted through&nbsp;<a class="sys_16" href="https://myapplication.kcl.ac.uk/">myApplication</a>&nbsp;in the event that any further supporting documents are required before the application can be fully processed.</li>
<li>Decisions on completed applications submitted by January 15 will be made before March 31st. All notifications of decisions will be sent through&nbsp;<a class="sys_16" href="https://myapplication.kcl.ac.uk/">myApplication</a>.</li>
</ol>
<p>&nbsp;</p>

</div>



<h3 class="accordion-toggle">Post Qualification Nursing BSc Programmes &amp; Free Standing Courses</h3>
<div class="accordion-content">

<p>Applications for our post qualification nursing BSc programmes should be made direct to King&rsquo;s, through the King&rsquo;s Apply portal. <a title="Apply now" class="sys_16" onclick="void(window.open('https://apply.kcl.ac.uk/','','toolbar=yes,menubar=yes,location=yes,scrollbars=yes,status=yes,resizable=yes'));return false;" onkeypress="void(window.open('https://apply.kcl.ac.uk/','','toolbar=yes,menubar=yes,location=yes,scrollbars=yes,status=yes,resizable=yes'));return false;" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Intercalated BScs (iBScs)</h3>
<div class="accordion-content">

<p>Intercalated BScs (iBScs) programmes are for medical, dental and veterinary students. If you are from another university and you are interested in studying an iBSc at King&rsquo;s, then please see our detailed information on <a title="Intercalated BScs: How to apply" class="sys_0 sys_t7240628" href="/study/subject-areas/intercalated/how-to-apply.aspx">how to apply</a>. If you are a current King&rsquo;s student, please refer to application information on the <a class="sys_16" href="https://internal.kcl.ac.uk/lsm/students/ug/intercalated-bsc/how-to-apply.aspx">King's internal website</a>.<a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Transfers to undergraduate degree courses</h3>
<div class="accordion-content">

<p>Some of our academic departments may consider transfer applications from suitably qualified students currently attending other universities. Visit the&nbsp;<a class="sys_0 sys_t2452" href="https://www.kcl.ac.uk/study/undergraduate/apply/transferring-to-kings.aspx">transferring to King&rsquo;s web page</a>&nbsp;for more information. Transfer applications must be submitted through UCAS</p>
<p><a title="UCAS" class="sys_16" href="https://www.ucas.com/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">King's International Foundation Programme</h3>
<div class="accordion-content">

<p>Applications for our one year full-time International Foundation academic preparation course should be made direct to King's, through the <a class="sys_16" href="https://apply.kcl.ac.uk/">King&rsquo;s Apply portal</a>. We have detailed guidance on the supporting documentation needed for your application on the relevant International Foundation Programme course web page.<a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Study Abroad</h3>
<div class="accordion-content">

<p>King's welcomes students currently enrolled in universities outside the UK to <a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/">participate</a> on a study abroad programme.&nbsp;</p>
<p>You can study abroad at King's either as an exchange or a study abroad fee-paying student for the full academic year (starting in September) or for one semester only (September-December or January-June).&nbsp;</p>
<p>Visit our&nbsp;<a class="sys_16" title="Study abroad" href="/study/abroad/index.aspx">Study Abroad web pages</a>&nbsp;to find out more.<a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>


</div>

<h3><br />After you&rsquo;ve applied</h3>
<p>Your application can be tracked using our&nbsp;<a class="sys_16" title="King&#39;s application portal" onclick="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" onkeypress="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" href="https://myapplication.kcl.ac.uk/">online portal, King's Apply</a>, where you can:</p>
<ul>
<li>
<p>see offer details</p>
</li>
<li>
<p>check if you&rsquo;ve been invited to interview</p>
</li>
<li>
<p>apply for accommodation</p>
</li>
<li>
<p>learn more about the &lsquo;points-based&rsquo; visa system</p>
</li>
</ul>
<p>After you have applied to King's, we will send you a username and password so you can access these pages. To contact us about your application during the application year, please use our <a title="King&#39;s application portal" class="sys_16" onclick="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" onkeypress="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" href="https://myapplication.kcl.ac.uk/">online portal, King's Apply</a>&nbsp;.</p>
<p><a class="sys_0 sys_t2452" title="Tracking your application" href="https://www.kcl.ac.uk/study/undergraduate/apply/faqs/tracking-your-application.aspx">Read our FAQs on tracking your application</a></p>


<div class="contentpage-accordion clearfix">
<br />


<h3 class="accordion-toggle">Cancellation rights</h3>
<div class="accordion-content">

<p class="MRNoHead2">Please note, these terms and conditions apply to all levels of study.&nbsp; For applications to undergraduate study, we also advise applicants to contact UCAS directly for details of your cancellation rights.</p>
<p class="MRNoHead2">1.1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; You have the right to cancel your acceptance of a place at King&rsquo;s for any reason (including if you change your mind) during a fourteen (14) day cancellation period (the &ldquo;Cancellation Period&rdquo;), which will start on the day you accept an offer from King&rsquo;s.</p>
<p class="MRNoHead2">1.2 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; To cancel your acceptance, you must clearly inform us in writing of your decision to cancel before the Cancellation Period has expired. We ask that you do this by sending a message through &ldquo;King&rsquo;s Apply&rdquo;. Alternatively, you may contact the King&rsquo;s Admissions Office by letter or email. You may also use the <a title="Cancellation Form - Kings College London" class="sys_17" href="/study/assets/word/admissions/v.2-cancellation-form.docx">Cancellation Form </a>to notify us of your decision to cancel.</p>
<p class="MRNoHead2">1.3 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; To meet the cancellation deadline, it is sufficient for you to send your communication concerning your exercise of the right to cancel before the Cancellation Period has expired. We do not have to have received it before the expiry of the Cancellation Period.</p>
<p class="MRNoHead2">1.4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; If you cancel your acceptance within the 14 day Cancellation Period, we will reimburse any tuition fee payment including any deposit received from you as soon as we can, and no later than 14 days after the day on which you informed us of your decision to cancel your acceptance.</p>

</div>


</div>

"""
                ]))
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 15

0

Exibir arquivo

Arquivo: UniversityOfHertfordshire_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "http://www.herts.ac.uk/"
        item['university'] = "University of Hertfordshire"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            department = response.xpath(
                "//div[@class='banner__caption banner__caption--below']//text()"
            ).extract()
            department = ''.join(department).replace("in the ", "").strip()
            item['department'] = department
            # print("item['department']: ", item['department'])

            # 专业、学位类型
            degree_name = response.xpath(
                "//span[@class='color--red']/../text()").extract()
            degree_name_str = ''.join(degree_name).strip()
            item['degree_name'] = degree_name_str.replace("(Hons)", "").strip()
            # print("item['degree_name']: ", item['degree_name'])

            programme_en = response.xpath(
                "//span[@class='color--red']//text()|//nav[@class='breadcrumb']//li[contains(text(),'Physics')]//text()"
            ).extract()
            item['programme_en'] = ''.join(programme_en).replace(
                degree_name_str, "").strip()
            # print("item['programme_en']: ", item['programme_en'])

            if "online" not in item['programme_en'].lower():
                if item['degree_name'] == "":
                    # print("*********")
                    degree_name_re = re.findall(r"^.*\(Hons\)",
                                                item['programme_en'])
                    d_re = ''.join(degree_name_re).strip()
                    item['degree_name'] = d_re.replace("(Hons)", "").strip()
                    item['programme_en'] = item['programme_en'].replace(
                        d_re, "").strip()
                print("item['degree_name']1: ", item['degree_name'])
                print("item['programme_en']1: ", item['programme_en'])
                # if item['degree_name'] == "":
                #     print("*****111****")

                duration = response.xpath(
                    "//h4[contains(text(),'Course length')]/following-sibling::div[1]//text()"
                ).extract()
                clear_space(duration)
                # print("duration: ", duration)
                item['other'] = ' '.join(duration).strip()
                # print("item['other']: ", item['other'])
                duration_str = ''.join(duration)

                duration_list = getIntDuration(duration_str)
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                location = response.xpath(
                    "//h4[contains(text(),'Locations')]/following-sibling::div[1]//text()"
                ).extract()
                clear_space(location)
                item['location'] = ''.join(location).strip()
                # print("item['location'] = ", item['location'])

                # //div[@id='overview']
                overview = response.xpath(
                    "//section[@data-section='section-overview']").extract()
                overview_en = remove_class(clear_lianxu_space(overview))
                item['overview_en'] = overview_en
                # print("item['overview_en']: ", item['overview_en'])
                # if item['overview_en'] == "":
                #     print("*****111****")

                career_en = response.xpath(
                    "//h3[contains(text(),'Teaching methods')]/preceding-sibling::*"
                ).extract()
                if len(career_en) == 0:
                    career_en = response.xpath(
                        "//h3[contains(text(),'Careers')]|//h3[contains(text(),'Careers')]/following-sibling::*"
                    ).extract()
                item['career_en'] = remove_class(
                    clear_lianxu_space(career_en)).replace(
                        "<h2>Course details</h2>", "").strip()
                # print("item['career_en']: ", item['career_en'])
                # if item['career_en'] == "":
                #     print("*****111****")

                assessment_en = response.xpath(
                    "//h3[contains(text(),'Teaching methods')]|//h3[contains(text(),'Teaching methods')]/following-sibling::*"
                ).extract()
                item['assessment_en'] = remove_class(
                    clear_lianxu_space(assessment_en))
                # print("item['assessment_en']: ", item['assessment_en'])
                # if item['assessment_en'] == "":
                #     print("*****111****")

                modules = response.xpath(
                    "//div[@id='module-structure']").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))
                # print("item['modules_en']: ", item['modules_en'])

                # //div[@id='fees']
                feeContent = response.xpath(
                    "//h4[contains(text(),'International Students')]/following-sibling::h5[contains(text(), 'Full time')]/following-sibling::*[1]//text()"
                ).extract()
                if len(feeContent) == 0:
                    feeContent = response.xpath(
                        "//h4[contains(text(),'International Students')]/following-sibling::p//strong[contains(text(),'Full time')]/..//text()"
                    ).extract()
                clear_space(feeContent)
                # print("feeContent: ", feeContent)
                feelist = re.findall(r"£[\d,]+", ''.join(feeContent))
                if len(feelist) > 0:
                    item['tuition_fee'] = int(feelist[0].replace(
                        '£', '').replace(',', '').strip())
                # print("item['tuition_fee']: ", item['tuition_fee'])

                # //div[@id='how-to-apply']
                entry_requirements = response.xpath(
                    "//h2[contains(text(),'How to apply')]/preceding-sibling::*//text()"
                ).extract()
                rntry_requirements = clear_lianxu_space(entry_requirements)
                # print("item['rntry_requirements']: ", item['rntry_requirements'])

                # print("entry_requirementsStr: ", entry_requirementsStr)
                # ielts = re.findall(r"IELTS[\sa-zA-Z]*\d\.?\d?[\sa-z\(\)]*\d\.?\d?[\sa-z\(\)]{1,100}", rntry_requirements)
                # # print("ielts: ", ielts)
                item[
                    'ielts_desc'] = 'https://www.herts.ac.uk/international/new-international-students/applying-to-the-university-of-hertfordshire/international-entry-requirements'
                # print("item['ielts_desc']: ", item['ielts_desc'])

                if "Humanities" in item['programme_en'] or "Nursing" in item['programme_en'] or "Social Work" in item['programme_en'] or\
                        item['degree_name'] == "BSc" and item['programme_en'] == "Nutrition" or \
                                        item['degree_name'] == "BSc" and item['programme_en'] == "Pharmaceutical Science":
                    item[
                        'ielts_desc'] = 'IELTS 6.5 (minimum band scores also apply)'
                    item['ielts'] = '6.5'
                    item['ielts_l'] = '6.5'
                    item['ielts_s'] = '6.5'
                    item['ielts_r'] = '6.5'
                    item['ielts_w'] = '6.5'
                elif item['degree_name'] == "BSc" and item[
                        'programme_en'] == "Physiotherapy" or item[
                            'degree_name'] == "BSc" and item[
                                'programme_en'] == "Dietetics":
                    item[
                        'ielts_desc'] = 'IELTS 7.0 (minimum band scores also apply)'
                    item['ielts'] = '7.0'
                    item['ielts_l'] = '7.0'
                    item['ielts_s'] = '7.0'
                    item['ielts_r'] = '7.0'
                    item['ielts_w'] = '7.0'
                else:
                    item[
                        'ielts_desc'] = 'International English Language Testing System (IELTS) score of 6.0 (with no less than 5.5 in any band) for undergraduate or 6.5 (with no less than 5.5 in any band) for postgraduate.'
                    item['ielts'] = '6.0'
                    item['ielts_l'] = '5.5'
                    item['ielts_s'] = '5.5'
                    item['ielts_r'] = '5.5'
                    item['ielts_w'] = '5.5'
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                # //div[@class='how-to-apply-table']/preceding-sibling::*[position()>1]
                item['alevel'] = None
                alevel = response.xpath(
                    "//p[contains(text(),'UCAS points')]//text()").extract()
                if len(alevel) == 0:
                    alevel = response.xpath(
                        "//li[contains(text(),'UCAS points')]//text()"
                    ).extract()
                    if len(alevel) == 0:
                        alevel = response.xpath(
                            "//*[contains(text(),'UCAS points')]//text()"
                        ).extract()
                        if len(alevel) == 0:
                            alevel = response.xpath(
                                "//strong[contains(text(),'A-Levels')]/../following-sibling::p[1]//text()|"
                                "//strong[contains(text(),'A Levels')]/../following-sibling::*[1]//text()"
                            ).extract()
                if len(alevel) > 0:
                    item['alevel'] = remove_class(
                        clear_lianxu_space([alevel[0]]))
                    if item['programme_en'] == "Nursing (Mental Health)":
                        item['alevel'] = remove_class(
                            clear_lianxu_space(alevel))
                print("item['alevel']: ", item['alevel'])

                # //p[contains(text(),' IB')]
                item['ib'] = None
                ib = response.xpath(
                    "//p[contains(text(),' IB')]//text()|//*[contains(text(),'IB -')]//text()|//*[contains(text(),'IB –')]//text()"
                ).extract()
                if len(ib) == 0:
                    ib = response.xpath(
                        "//strong[contains(text(),'International Baccalaureate')]/../following-sibling::p[1]//text()|"
                        "//h3[contains(text(),'International Baccalaureate')]/following-sibling::p[1]//text()"
                    ).extract()
                if len(ib) > 0:
                    item['ib'] = remove_class(clear_lianxu_space([ib[-1]]))
                print("item['ib']: ", item['ib'])

                item['require_chinese_en'] = remove_class(
                    clear_lianxu_space([
                        """<tr><th scope="row" id="table76505r3c1"> Undergraduate (year 1)</th><td headers="table76505r1c2 table76505r3c1">
Chinese 3-year Senior High School certificate with 85% or above
</td><td headers="table76505r1c3 table76505r3c1">
IELTS 6.0 with no less than 5.5 in any band
</td></tr><tr><th scope="row" id="table76505r4c1"> Undergraduate (year 2/3)</th><td headers="table76505r1c2 table76505r4c1">
Chinese 3-year College Diploma in related subject area with 70% or above
<br /><br />
SQA HND in related subject area with overall B grade
<br /><br />
BTEC HND in related subject area with overall Merit profile

</td><td headers="table76505r1c3 table76505r4c1">
IELTS 6.0 with no less than 5.5 in any band
</td></tr>"""
                    ]))
                item[
                    'apply_proces_en'] = "https://www.herts.ac.uk/international/new-international-students/applying-to-the-university-of-hertfordshire"

                start_date = response.xpath(
                    "//div[@class='how-to-apply-table']//table//td[contains(text(),'Full')]/preceding-sibling::*[2]//text()"
                ).extract()
                clear_space(start_date)
                # print("start_date: ", start_date)
                start_date = list(set(start_date))
                start_end = []
                for s in start_date:
                    s_p = s.split("/")
                    # print(s_p)
                    start_end.append(s_p[-1] + "-" + s_p[-2] + "-" + s_p[0])
                item['start_date'] = ','.join(start_end).strip()
                # print("item['start_date'] = ", item['start_date'])

                ucascode = response.xpath(
                    "//span[contains(text(),'UCAS code')]/../text()").extract(
                    )
                clear_space(ucascode)
                # print("ucascode: ", ucascode)
                if ',' in ''.join(ucascode):
                    print("****")
                    ucascode_sp = ''.join(ucascode).split(',')
                    # print(ucascode_sp)
                    for u in ucascode_sp:
                        if len(u.strip()) == 4:
                            item['ucascode'] = u.strip()
                            print("item['ucascode']1 = ", item['ucascode'])
                            yield item
                else:
                    item['ucascode'] = ''.join(ucascode).strip()
                    # print("item['ucascode'] = ", item['ucascode'])
                    yield item
                # print("len: ", len(ucascode))

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 16

0

Exibir arquivo

Arquivo: AstonUniversity_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.aston.ac.uk/"
        item['university'] = "Aston University"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Aston University,Birmingham, B4 7ET"
        print("======================================")
        print(response.url)
        try:
            programmeDegreetype = response.xpath(
                "//h1[@id='skiplinks']//text()").extract()
            programmeDegreetypeStr = ''.join(programmeDegreetype)
            # print(programmeDegreetypeStr)
            degree_type = re.findall(
                r"^\w+/\w+|^\w+\s/\s\w+|^\w+\s\(Hons\)|^[BML]\w{1,7}\s",
                programmeDegreetypeStr)
            # print("degree_type = ", degree_type)
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            if item['degree_name'] == "Business":
                item['degree_name'] = ""
            programme = programmeDegreetypeStr.replace(item['degree_name'],
                                                       "").strip()
            item['programme_en'] = ''.join(programme).replace(
                "(Hons)", "").strip().strip("in").strip()
            print("item['degree_name']: ", item['degree_name'])
            print("item['programme_en']: ", item['programme_en'])

            alevel = response.xpath(
                "//span[contains(text(),'A Levels')]/../../../../../../following-sibling::*[1]//text()|"
                "//span[contains(text(),'A-Levels')]/../../../../../../following-sibling::*[1]//text()"
            ).extract()
            clear_space(alevel)
            # print("alevel: ", alevel)
            item['alevel'] = ''.join(alevel).strip()

            if item['alevel'] == "":
                alevel1 = response.xpath(
                    "//strong[contains(text(),'A Levels')]/..//text()"
                ).extract()
                if "A Levels" in ''.join(alevel1).strip():
                    alevelindex = ''.join(alevel1).strip().index('A Levels')
                    item['alevel'] = ''.join(
                        ''.join(alevel1).strip()[alevelindex:]).strip()
            # print("item['alevel']: ", item['alevel'])
            if len(item['alevel']) > 300:
                item['alevel'] = item['alevel'][:301]

            ib = response.xpath(
                "//span[contains(text(),'IB')]/../../../../../../following-sibling::*[2]//text()|"
                "//strong[contains(text(),'IB')]/..//text()").extract()
            clear_space(ib)
            item['ib'] = ''.join(ib).strip()
            # print("item['ib']: ", item['ib'])
            if len(item['ib']) > 300:
                item['ib'] = item['ib'][:301]

            overview = response.xpath(
                "//a[contains(text(),'Course overview')]/../../../../../..|"
                "//*[contains(text(), 'Course outline')]/../../../../../../div/following-sibling::div[1]|"
                "//*[contains(text(), 'Course Outline')]/../../../../../div/../following-sibling::div[1]//*[contains(text(), 'Modules')]/../preceding-sibling::*|"
                "//*[contains(text(),'Course Outline')]/../../../../../following-sibling::div[1]//*[contains(text(),'Sample module options')]/../preceding-sibling::*|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Sample module options')]/../preceding-sibling::*|"
                "//*[contains(text(), 'Course Outline')]/../../../../../div/../following-sibling::div[1]//*[contains(text(), 'Sample module options')]/../../preceding-sibling::*|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Sample module options')]/..|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Core modules:')]/../preceding-sibling::*|"
                "//strong[contains(text(),'Courses')]/../../following-sibling::div[1]|"
                "//*[contains(text(), 'Programme outline and modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Modules')]/..|"
                "//*[contains(text(),'Course Outline')]/../../../../../following-sibling::div[1]//*[contains(text(),'Sample Module Options')]/../preceding-sibling::*|"
                "//*[contains(text(),'Course Outline & Modules')]/../../../../../following-sibling::div[1]//*[contains(text(),'Modules')]/preceding-sibling::*"
            ).extract()
            if len(overview) == 0:
                overview = response.xpath(
                    "//a[contains(text(),'Course Outline')]/../../../../../.."
                ).extract()
            item['overview_en'] = ''.join(
                remove_class(clear_lianxu_space(overview)).replace(
                    "<br>", "").strip().split("\n")).strip()
            # if item['overview_en'] == "":
            #     print("overview 为空")
            # print("item['overview_en'] = ", item['overview_en'])

            modules_en = response.xpath(
                "//*[contains(text(),'modules:')]/../..|//strong[contains(text(),'Programme content')]/../preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            if len(modules_en) == 0:
                modules_en = response.xpath(
                    "//*[contains(text(),'Modules')]/../../..").extract()
                if len(modules_en) == 0:
                    modules_en = response.xpath(
                        "//*[contains(text(),'Modules')]/../..").extract()
                    if len(modules_en) == 0:
                        modules_en = response.xpath(
                            "//*[contains(text(),'Modules')]/..").extract()
                        if len(modules_en) == 0:
                            modules_en = response.xpath(
                                "//*[contains(text(),'What you will study')]/../../../../../following-sibling::*"
                            ).extract()
                            if len(modules_en) == 0:
                                modules_en = response.xpath(
                                    "//*[contains(text(), 'Subject guide and modules')]/../../../../../../div/following-sibling::div[1]"
                                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            if item['modules_en'] == "":
                item['modules_en'] = None
            print("item['modules_en'] = ", item['modules_en'])

            career_en = response.xpath(
                "//*[contains(text(),'Your future career')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Your future career opportunities')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career opportunities')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Professional development programme')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career Prospects')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Professional Development Programme')]/../../../../div/following-sibling::*|"
                # "//*[contains(text(),'Professional Development Programme')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career prospects')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career Opportunities')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Graduate destinations')]/../../../../../following-sibling::*|"
                "//a[contains(text(),'Career')]/../../../../../following-sibling::*|"
                "//a[contains(text(),'Personal Development')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Professional accreditation')]/../../../../../following-sibling::*"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en'] = ", item['career_en'])
            # if item['career_en'] == "":
            #     print("*** career")

            # following-sibling::*
            assessment_en = response.xpath(
                "//a[@id='learning'][contains(text(),'Learning, teaching & assessment')]/../..|"
                "//a[@class='panel-event'][contains(text(),'Learning, teaching & assessment')]/../../../../../..|"
                "//*[contains(text(),'Learning, Teaching & Assessment')]/../../../../../..|"
                "//*[contains(text(),'Learning, Teaching and Assessment')]/../../../../../..|"
                "//*[contains(text(),'Learning, teaching and assessment')]/../../../../../..|"
                "//*[contains(text(),'Learning, teaching and assessments')]/../../../../../..|"
                "//*[contains(text(),'Learning, teaching & assesment')]/../../../../../.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en'] = ", item['assessment_en'])

            rntry_requirements = response.xpath(
                "//*[contains(text(),'Entry requirements & fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry Requirements & Fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Key information and entry requirements')]/../../../../../..//text()|"
                "//*[contains(text(),'Key information for applicants & entry requirements')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry requirements')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry Requirements')]/../../../../../following-sibling::*//text()"
            ).extract()
            start_date = rntry_requirements
            # print("start_date: ", start_date)
            item['apply_desc_en'] = "<div>" + clear_lianxu_space(
                rntry_requirements) + "</div>"
            # print("item['apply_desc_en'] = ", item['apply_desc_en'])
            clear_space(start_date)
            duration_str = '; '.join(start_date)

            tuition_fee = response.xpath(
                "//*[contains(text(),'Fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'fees')]/../../../../../following-sibling::*//text()|"
                "//strong[contains(text(),'Tuition fees')]/..//text()"
            ).extract()
            if len(tuition_fee) == 0:
                tuition_fee = response.xpath(
                    "//strong[contains(text(),'Fees:')]/../following-sibling::*[1]//text()"
                ).extract()
            clear_space(tuition_fee)
            tuition_fee_str = ''.join(tuition_fee)
            # print("tuition_fee_str: ", tuition_fee_str)
            tuition_fee_re = re.findall(
                r"International.*?£\d+,\d+|non-EU.*?£\d+,\d+|MSc.*?£\d+,\d+|entry:£\d+,\d+|2018/2019:£\d+,\d+|£\d+,\d+\sfor\sOutside\sEU|£\d+,\d+",
                tuition_fee_str, re.I)
            tuition_fee_re += re.findall(r"£\d+,\d+", duration_str)
            # print("tuition_fee_re: ", tuition_fee_re)
            if len(tuition_fee_re) != 0:
                t = re.findall(r"\d+,\d+", ''.join(tuition_fee_re))
                # item['tuition_fee'] = int(''.join(t).replace(",", "").strip())
                # print("item['tuition_fee']1 = ", item['tuition_fee'])
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            # duration = response.xpath(
            #     "//*[contains(text(),'Duration')]/following-sibling::*//text()|"
            #     "//*[contains(text(),'Duration')]/..//text()").extract()
            # ((One)|(Two)|(Three)|(Four)|(Five)|(Six)|(Seven)|(Eight)|(Nine)|(Ten).{1,8}year)
            duration_re = re.findall(
                r'Duration.{1,85}|\d.{1,8}year|One.{1,8}year|Two.{1,8}year|Three.{1,8}year|Four.{1,8}year|Five.{1,8}year|Six.{1,8}year|Seven.{1,8}year|Eight.{1,8}year|Nine.{1,8}year|Ten.{1,8}year',
                duration_str, re.I)
            duration_re += re.findall(
                r'Duration.{1,80}|\d.{1,8}year|One.{1,8}year|Two.{1,8}year|Three.{1,8}year|Four.{1,8}year|Five.{1,8}year|Six.{1,8}year|Seven.{1,8}year|Eight.{1,8}year|Nine.{1,8}year|Ten.{1,8}year',
                item['overview_en'], re.I)
            # if len(duration) == 0:
            #     duration = response.xpath("//*[contains(text(),'Duration of course')]/../following-sibling::*[1]//text()").extract()
            clear_space(duration_re)
            duration_str = ' '.join(duration_re)
            # print("duration_str: ", duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # https://www2.aston.ac.uk/about/termdates/2019-2020
            start_date_str = '; '.join(start_date)
            # print("start_date_str: ", start_date_str)
            start_date_re = re.findall(r'Start.{1,25}', start_date_str)
            # print("start_date_re", start_date_re)
            item['start_date'] = getStartDate(''.join(start_date_re))
            # print("item['start_date']: ", item['start_date'])

            # ielts_desc = ' '.join(start_date)
            # ielts_desc = re.findall(r'.{1,80}IELTS.{1,80}', ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            allcontent = response.xpath(
                "//div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-sigma']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-rho']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-delta']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-sigma'][2]//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-upsilon']//text()"
            ).extract()
            clear_space(allcontent)
            department_1 = response.xpath(
                "//a[@href='/study/postgraduate/taught-programmes/abs/']//text()"
            ).extract()
            # print(department_1)
            if len(department_1) > 0:
                item['department'] = ''.join(department_1[0]).strip()
            department_re = re.findall(
                r"Life\s&\sHealth\sSciences\s-\sOSPAP|Aston\sBusiness\sSchool|Engineering\s&\sApplied\sScience|Languages\s&\sSocial\sSciences|Life\s&\sHealth\sSciences",
                ''.join(allcontent))
            # print("department_re: ", department_re)
            if item['department'] == "":
                if len(department_re) > 0:
                    item['department'] = ''.join(department_re[0]).strip()
            # print("item['department']: ", item['department'])

            # Aston Business School
            de_1 = [
                "full time mba",
                "executive mba - part time",
                "online mba",
                "the executive dba",
                "phd programme",
                "msc business analytics",
                "msc business & management",
                "msc business & management (online)",
                "msc information systems & business analysis",
                "msc supply chain management",
                "msc international business",
                "msc international accounting & finance",
                "msc international accounting & finance (online)",
                "msc strategy and international business",
                "msc entrepreneurship",
                "msc accounting & finance",
                "msc business economics & finance",
                "msc finance",
                "msc international accounting & finance",
                "msc international accounting & finance (online)",
                "msc investment analysis",
                "msc strategic marketing management ",
                "msc human resource management & business",
                "msc organisational behaviour",
                "msc work psychology & business",
                "international pre-masters",
            ]
            #Engineering & Applied Science
            de_2 = [
                "msc professional engineering",
                "msc computer science",
                "msc software engineering ",
                "msc software project management",
                "msc professional engineering",
                "msc electrical power engineering and systems ",
                "msc telecommunications systems",
                "msc wireless communications and networking",
                "msc smart telecom and sensing networks (smartnet)",
                "msc photonic integrated circuits, sensors and networks (pixnet)",
                "msc professional engineering",
                "msc engineering management",
                "msc supply chain management",
                "msc engineering leadership & management",
                "msc supply chain leadership and management",
                "msc professional engineering",
                "msc mechanical engineering ",
                "msc product design ",
                "msc professional engineering",
            ]
            #Languages & Social Sciences
            de_3 = [
                "ma in forensic linguistics",
                "ma in the european union & international relations",
                "joint ma in multilevel governance & international relations",
                "double ma in europe & the world",
                "double ma in governance and international politics",
                "ma in international relations and global governance",
                "ma in sociology and social research",
                "ma in policy and social research",
                "ma in teaching english to speakers of other languages (tesol)",
                "ma in tesol and translation studies",
                "ma in tesol and translation studies",
                "ma in translation in a european context",
            ]
            # Life & Health Sciences
            de_4 = [
                "advanced hearing therapy practice - msc",
                "clinical science (neurosensory sciences) - msc",
                "doctor of hearing therapy - professional doctorate",
                "biomedical science - msc",
                "biomedical sciences top modules - all standalone modules",
                "stem cells and regenerative medicine - msc",
                "clinical neurophysiology practice - msc",
                "clinical science (neurosensory sciences) - msc",
                "neurophysiology - pgcert",
                "clinical science (neurosensory sciences) - msc",
                "doctor of optometry / doctor of ophthalmic science - professional doctorate",
                "graduate diploma in optometry - graduate diploma",
                "independent prescribing for optometrists - professional accreditation",
                "optometry / ophthalmic science - msc",
                "overseas pharmacists course (ospap) - full time pgdip / msc",
                "pharmacist independent prescribing - pgcert",
                "pharmacy (includes: msc pharmaceutical sciences, msc drug delivery, and msc pharmacokinetics) – msc",
                "psychiatric pharmacy by distance learning and practice - pgdip",
                "psychiatric pharmacy practice - msc",
                "psychiatric therapeutics by distance learning - pgcert",
                "cognitive neuroscience - msc",
                "health psychology (online) - msc",
                "health psychology (on campus) - msc",
            ]
            if item['department'] == "":
                for de1 in de_1:
                    if item['programme_en'] == de1:
                        item['department'] = "Aston Business School"
                        break
            if item['department'] == "":
                for de2 in de_2:
                    if item['programme_en'] == de2:
                        item['department'] = "Engineering & Applied Science"
                        break
            if item['department'] == "":
                for de3 in de_3:
                    if item['programme_en'] == de3:
                        item['department'] = "Languages & Social Sciences"
                        break
            if item['department'] == "":
                for de4 in de_4:
                    if item['programme_en'] == de4:
                        item['department'] = " Life & Health Sciences"
                        break

            if 'business' in item['programme_en'].lower():
                item['department'] = "Aston Business School"
            if 'electrical' in item['programme_en'].lower(
            ) or 'engineering' in item['programme_en'].lower():
                item['department'] = "Engineering & Applied Science"
            # print("item['department']1: ", item['department'])

            if item['department'] == "Aston Medical School":
                item['ielts'] = 7.5
                item['ielts_l'] = 7
                item['ielts_s'] = 7
                item['ielts_r'] = 7
                item['ielts_w'] = 7
                item['toefl'] = 109
                item['toefl_l'] = 26
                item['toefl_r'] = 26
                item['toefl_s'] = 23
                item['toefl_w'] = 28
            elif item['department'] == "Engineering & Applied Science" or item[
                    'department'] == "Languages & Social Sciences":
                item['ielts'] = 6
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
                item['toefl'] = 78
                item['toefl_l'] = 11
                item['toefl_r'] = 12
                item['toefl_s'] = 17
                item['toefl_w'] = 20
            else:
                item['ielts'] = 6.5
                item['ielts_l'] = 6
                item['ielts_s'] = 6
                item['ielts_r'] = 6
                item['ielts_w'] = 6
                item['toefl'] = 93
                item['toefl_l'] = 19
                item['toefl_r'] = 18
                item['toefl_s'] = 19
                item['toefl_w'] = 23
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #                 item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #        item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="tab-inner">
<div id="panelGroupHeader_73655987" class="tab-header-outer">
<div class="tab-header-inner">
<ul>
<li class="header"><h2><a href="#" class="panel-event">Undergraduate</a></h2></li>
<li class="expander"><a href="/international-students/your-country/east-asia/china/#" class="panel-event">Expand / Collapse</a>
</li>
</ul>
</div>
</div>
<div id="panelGroupBody_73655987" class="tab-body-outer">
<div class="tab-body-inner"><div class="ContentEditor"><p> <span style="line-height: 1.4em">Students who have achieved an average of 80% in the academic subjects in their Senior High School Leaving Certificate after 3 years of study may be considered for a Foundation programme.  </span><span style="line-height: 1.4em"> </span></p> <p>Students with 2 or 3 year University or College Diploma can be considered for undergraduate study - Year 1 entry. Applicants should be scoring a min of 80% average in relevant  academic subjects. <br /> <br />University students who have studied 1-2 years (full-time) at a recognised university may be eligible for first year entry, dependent on subjects, institution and grades. <span style="line-height: 1.4em"> </span></p> </div>
</div>
</div>
</div>
"""
                ]))
            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS Code')]/following-sibling::*[1]//text()|"
                "//strong[contains(text(),'UCAS Code')]/../text()|"
                "//div[@class='ContentEditor']//strong[3]/../text()|"
                "//strong[contains(text(),'UCAS code')]/..//text()").extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            if len(ucascode) > 0:
                for u in ucascode:
                    if len(u.replace(":", "").replace(".", "").strip()) == 4:
                        item['ucascode'] = u.replace(":",
                                                     "").replace(".",
                                                                 "").strip()
                        break
            print("item['ucascode']: ", item['ucascode'])

            # print(ucascode[:7])
            # if len(item['ucascode']) != 4 and len(ucascode) > 0:
            #     for u in ucascode[:6]:
            #         ucascode_uu_re = re.findall(r"[A-Z]\w{3}", u)
            #         print("ucascode_uu_re: ", ucascode_uu_re)

            if item['ucascode'] == "":
                ucascode_re = re.findall(r"UCAS\sCode.{1,8}", duration_str,
                                         re.I)
                if len(ucascode_re) == 0:
                    ucascode_re = re.findall(
                        r"UCAS\sCode.{1,8}",
                        remove_tags(clear_lianxu_space([response.text])), re.I)
                print("ucascode_re: ", ucascode_re)
                item['ucascode'] = ''.join(ucascode_re).replace(
                    "UCAS Code",
                    "").replace("UCAS code", "").replace(".", "").replace(
                        "(", "").replace(";", "").replace(":", "").strip()
            print("item['ucascode']1: ", item['ucascode'])
            if item['ucascode'] == "":
                print("**** ucascode")

            # if "/" in item['degree_name']:
            #     print("//////////")
            if item['programme_en'] == "Optometry":
                item['degree_name'] = "Bsc"
                item['ucascode'] = 'B510'
                yield item
                item['degree_name'] = "MOptom"
                item['ucascode'] = 'B512'
                yield item
            elif item['programme_en'] == "Biomedical Engineering":
                item['degree_name'] = "BEng"
                item['ucascode'] = 'H542'
                yield item
                item['degree_name'] = "MEng"
                item['ucascode'] = 'H541'
                yield item
            elif item['programme_en'] == "Psychology":
                item['duration'] = 3
                item['ucascode'] = 'C800'
                yield item
                item['duration'] = 4
                item['ucascode'] = 'C801'
                yield item
            elif item[
                    'programme_en'] == "International Relations and Modern Languages (French/German/Spanish)":
                item['ucascode'] = 'LR2C'
                yield item
                item['ucascode'] = 'LR2G'
                yield item
                item['ucascode'] = 'LR2K'
                yield item
            elif item[
                    'programme_en'] == "International Business and Modern Languages":
                item['ucascode'] = 'NR11'
                yield item
                item['ucascode'] = 'NR12'
                yield item
                item['ucascode'] = 'NR14'
                yield item
                item['ucascode'] = 'NR24'
                yield item
                item['ucascode'] = 'NR33'
                yield item
                item['ucascode'] = 'NR44'
                yield item
                item['ucascode'] = 'NR15'
                yield item
            else:
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 17

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Surrey"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item[
            'location'] = '01SE01, Senate House, University of Surrey, Guildford, Surrey GU2 7XH'
        # print("item['location'] = ", item['location'])
        print("===============================")
        print(response.url)
        try:
            overview = response.xpath(
                "//h3[contains(text(),'Course facts')]/../preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            career = response.xpath(
                "//h2[contains(text(),'Careers')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-3]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # if item['career_en'] == "":
            #     print("***career_en")
            # print("item['career_en'] = ", item['career_en'])

            modules = response.xpath(
                "//div[@class='module-list']/following-sibling::*[1]/preceding-sibling::*"
            ).extract()
            # modules1 = response.xpath("//div[@id='modules-ft']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # if item['modules_en'] == "":
            #     print("***modules_en")
            # print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h2[contains(text(),'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<7]|"
                "//h2[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<3]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # if item['assessment_en'] == "":
            #     print("***assessment_en")
            # print("item['assessment_en'] = ", item['assessment_en'])

            # //a[contains(text(), 'Faculty of')]|//a[contains(text(), 'School of')]
            department = response.xpath(
                "//a[contains(text(), 'Faculty of')]//text()|"
                "//a[contains(text(), 'School of')]//text()").extract()
            item['department'] = remove_class(
                clear_lianxu_space(department)).replace(
                    "academic staff in the", "").strip()
            # if item['department'] == "":
            #     print("***department")
            # print("item['department'] = ", item['department'])

            entry_requirements = response.xpath(
                "//div[@id='entry-collapse']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            # print("item['apply_desc_en'] = ", item['apply_desc_en'])

            alevel = response.xpath(
                "//h3[contains(text(),'A-level')]/following-sibling::*[1]//text()"
            ).extract()
            alevel_str = ''.join(alevel).strip()
            if alevel_str == "Overall:" or alevel_str == "Overall":
                alevel = response.xpath(
                    "//h3[contains(text(),'A-level')]/following-sibling::*[position()<4]//text()"
                ).extract()
                alevel_str = ''.join(alevel).replace(
                    "Overall", "").strip().strip(":").strip()
                # print("***alevel")
            item['alevel'] = clear_space_str(alevel_str)
            # print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[1]//text()"
            ).extract()
            ib_str = ''.join(ib).strip()
            if ib_str == "Overall:":
                ib = response.xpath(
                    "//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[2]//text()"
                ).extract()
                ib_str = ''.join(ib).strip()
                # print("***ib")
            item['ib'] = ib_str
            # print("item['ib'] = ", item['ib'])

            ielts_str = response.xpath(
                "//div[@id='entry-collapse']//h2[contains(text(),'English')]/following-sibling::p[position()<4]//text()"
            ).extract()
            ielts_re = re.findall(r"^IELTS.{1,80}", ''.join(ielts_str))
            item['ielts_desc'] = ''.join(ielts_re).strip()
            # print("item['ielts_desc'] = ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            application_open_date = response.xpath(
                "//div[@class='p-3 p-xl-4 text-center text-light']//text()"
            ).extract()
            clear_space(application_open_date)
            # print("application_open_date: ", ''.join(application_open_date))
            item['application_open_date'] = getStartDate(
                ''.join(application_open_date))
            # if item['application_open_date'] == "":
            #     print("***application_open_date")
            # print("item['application_open_date'] = ", item['application_open_date'])

            tuition_fee = response.xpath(
                "//div[@id='fees']//tbody//tr[1]/td[last()-1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))

            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """ <h2>Process</h2>
<ol><li>Choose the programmes you want to study. Still undecided? Search our <a href="/undergraduate">undergraduate degrees</a></li>
<li>Find out <a href="/apply/undergraduate/how-to-apply-through-ucas">how to apply through UCAS</a></li>
<li>Wait for universities to make their decisions, <a href="/apply/undergraduate/after-you-apply">learn what happens after you apply</a></li>
<li>Reply to your <a href="/apply/undergraduate/your-offer">university offers</a></li>
<li><a href="/apply/undergraduate/your-offer">Confirm your university place</a></li>
</ol>"""
                ]))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # https://www.surrey.ac.uk/china/entry-requirements
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Undergraduate</h2>
<p>We do not accept the Chinese National University Entrance Examination. However, you can apply to study for an <a href="http://isc.surrey.ac.uk/programmes/international-foundation-year?ch=uniweb&amp;cc=uniweb&amp;cid=uniweb&amp;utm_source=signposting&amp;utm_medium=signposting&amp;utm_campaign=uniweb&amp;_ga=2.246594701.825790074.1509959240-87246970.1500115796">International Foundation Year</a> at our <a href="http://isc.surrey.ac.uk/">International Study Centre</a>, which will prepare you for a full undergraduate degree course.</p>"""
                ]))

            # 专业、学位类型
            programme_en = response.xpath(
                "//h1[@class='text-center my-0']//text()").extract()
            programme_en_str = (''.join(programme_en).split("–"))[0].strip()
            print(programme_en_str)

            if "2019" in ''.join(programme_en):
                item['start_date'] = '2019'
            # print("item['start_date'] = ", item['start_date'])

            # 判断可以拆分几条数据，ucascode、duration、degree_name
            is_degree_name = response.xpath(
                "//tbody[@class='w-100']/tr").extract()
            # print("is_degree_name: ", is_degree_name)
            print(len(is_degree_name))
            for i in range(len(is_degree_name)):
                print("****************" + str(i + 1) + "***************")
                degree_name_re = re.findall(r"\w+\s\(Hons\).*|\w+$",
                                            programme_en_str)
                if len(degree_name_re) > 0:
                    item['degree_name'] = ''.join(degree_name_re).strip()
                    item['programme_en'] = programme_en_str.replace(
                        item['degree_name'], '').strip()
                else:
                    item['programme_en'] = programme_en_str
                print("item['programme_en'] = ", item['programme_en'])

                degree_name_xpath = response.xpath(
                    "//tbody[@class='w-100']//tr[" + str(i + 1) +
                    "]/td[1]//text()").extract()
                clear_space(degree_name_xpath)
                item['degree_name'] = ''.join(degree_name_xpath).strip()
                print("item['degree_name'] = ", item['degree_name'])

                duration = response.xpath("//tbody[@class='w-100']//tr[" +
                                          str(i + 1) +
                                          "]/td[2]//text()").extract()
                clear_space(duration)
                # print("duration: ", duration)
                if len(duration) != 0:
                    duration_list = getIntDuration(''.join(duration))
                    # print("duration_list: ", duration_list)
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                print("item['duration'] = ", item['duration'])
                print("item['duration_per'] = ", item['duration_per'])

                ucascode = response.xpath("//tbody[@class='w-100']//tr[" +
                                          str(i + 1) +
                                          "]/td[4]//text()").extract()
                clear_space(ucascode)
                item['ucascode'] = ''.join(ucascode).strip()
                print("item['ucascode']: ", item['ucascode'])

                tick = response.xpath(
                    "//tbody[@class='w-100']//tr[" + str(i + 1) +
                    "]/td[3]//i[@class='icon icon-tick']").extract()
                clear_space(tick)
                print("tick: ", tick)
                print(len(tick))
                if len(tick) == 1:
                    item['other'] = 'Professional Training'
                print("item['other']: ", item['other'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 18

0

Exibir arquivo

    def parse_data(self, response):
        """Scrape one Cardiff Metropolitan University undergraduate course page.

        Builds a ``ScrapyschoolEnglandBenItem`` from XPath-selected fragments
        of ``response`` and yields it.  When ``item['major_type1']`` contains
        "Foundation" only the title fields are parsed and nothing is yielded.

        Any exception raised while parsing is appended to a per-university
        text file under ``scrapySchool_England_Ben/error/`` and printed; in
        that case no item is yielded.

        :param response: Scrapy response of the course page.  The listing name
            of the course is looked up in ``response.meta`` under the key
            ``response.url``.
        :yields: the populated item (at most one per response).
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Cardiff Metropolitan University"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        item['location'] = 'Llandaff Campus, Western Avenue, Cardiff, CF5 2YB'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            # Programme name and degree type, both taken from the page <h1>.
            title_parts = response.xpath("//div[@id='ordercontainer']/span[@id='DeltaPlaceHolderMain']/div[@class='coursefullwidth']/div/h1//text()").extract()
            if len(title_parts) == 0:
                title_parts = response.xpath(
                    "//div[@class='cstcoursetitle']/h1//text()").extract()

            clear_space(title_parts)
            title = ''.join(title_parts).strip()
            if title == "":
                # No heading on the page: fall back to the listing name.
                title = item['major_type1']
            title_split = title.split("-")
            if len(title_split) > 1:
                # "<programme> - <degree>": the last segment is the degree.
                item['degree_name'] = title_split[-1]
            item['programme_en'] = title_split[0]

            # degree_name is only assigned above when the title contains "-".
            # get_item's default is not visible here, so only normalise a
            # non-empty value instead of calling .replace() unconditionally.
            if item['degree_name']:
                item['degree_name'] = item['degree_name'].replace("(Hons)", "").replace("Degree", "").replace(" s", "").replace("(Joint Honours)", "").replace("(Franchised)", "").strip()
            print("item['degree_name']: ", item['degree_name'])
            if "(Top-Up)" in item['major_type1']:
                # Splitting on "-" cut "(Top-Up)" in half; restore the tail.
                item['programme_en'] = item['programme_en'] + "-Up)"
            print("item['programme_en']: ", item['programme_en'])

            if "Foundation" not in item['major_type1']:
                department = response.xpath("//div[@class='crumbcontainer']/span/span[1]/a[1]//text()").extract()
                clear_space(department)
                item['department'] = ''.join(department)

                duration = response.xpath(
                    "//strong[contains(text(),'Course Length:')]/..//text()").extract()
                clear_space(duration)
                duration_list = getIntDuration(''.join(duration).replace("Course Length:", "").strip())
                # Only a two-element result is used: first -> duration,
                # last -> duration_per.
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]

                overview = response.xpath(
                    "//div[@id='ordercontainer']/span[@id='DeltaPlaceHolderMain']/div[@class='coursefullwidth']/div[@class='coursecontentarea']/div[@class='courseoverview']").extract()
                item['overview_en'] = remove_class(clear_lianxu_space(overview))

                modules_en = response.xpath(
                    "//h3[contains(text(),'Course Content')]|//h3[contains(text(),'Course Content')]/following-sibling::div[1]").extract()
                if len(modules_en) == 0:
                    # Some pages spell the heading with a lower-case "c".
                    modules_en = response.xpath("//h3[contains(text(),'Course content')]/..").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules_en))

                assessment_en = response.xpath(
                    "//h3[contains(text(),'Learning & Teaching')]|//h3[contains(text(),'Learning & Teaching')]/following-sibling::div[1]|"
                    "//h3[contains(text(),'Assessment')]|//h3[contains(text(),'Assessment')]/following-sibling::div[1]").extract()
                item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))

                career_en = response.xpath(
                    "//h3[contains(text(),'Employability & Careers')]|//h3[contains(text(),'Employability & Careers')]/following-sibling::div[1]").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career_en))

                alevel = response.xpath(
                    "//*[contains(text(),'A levels')]//text()|"
                    "//*[contains(text(),'A Levels')]//text()").extract()
                item['alevel'] = clear_lianxu_space(alevel)

                ib = response.xpath(
                    "//strong[contains(text(),'International Baccalaureate:')]/../text()").extract()
                item['ib'] = clear_lianxu_space(ib)

                # Entry-requirements text; it is only mined for the IELTS and
                # interview sentences below and is not stored on the item.
                rntry_requirements = response.xpath(
                    "//h3[contains(text(),'Entry Requirements')]/following-sibling::div[1]//text()").extract()
                if len(rntry_requirements) == 0:
                    # NOTE(review): this heading also contains the text
                    # 'Entry Requirements', so the query above probably
                    # already covers it - confirm before removing.
                    rntry_requirements = response.xpath(
                        "//h3[contains(text(),'Entry Requirements & How to Apply')]/following-sibling::div[1]//text()").extract()
                rntry_requirements = clear_lianxu_space(rntry_requirements)

                ielts = re.findall(r"IELTS.{1,80}", rntry_requirements)
                clear_space(ielts)
                if len(ielts) > 0:
                    item['ielts_desc'] = ielts[0]

                ielts_dict = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')

                interview_desc_en = response.xpath("//*[contains(text(),'interview')]").extract()
                if len(interview_desc_en) == 0:
                    # No element mentions interviews: search the
                    # entry-requirements text instead.
                    interview_desc_en = re.findall(r".{1,100}interview.{1,100}", rntry_requirements)
                item['interview_desc_en'] = remove_class(clear_lianxu_space(interview_desc_en)).strip()

                # http://www.cardiffmet.ac.uk/international/study/applying/Pages/Fees-and-Money-Matters.aspx
                item['tuition_fee'] = '12000'
                item['apply_proces_en'] = 'http://www.cardiffmet.ac.uk/study/adviceforapplicants/undergraduate/Pages/default.aspx'

                # The last operand was listed twice in the original union; an
                # XPath union is a node-set, so listing it once selects the
                # same nodes.
                ucascode = response.xpath("//strong[contains(text(),'UCAS Code')]/following-sibling::*[1]//strong//text()|"
                                          "//strong[contains(text(),'UCAS Code')]/..//text()|"
                                          "//strong[contains(text(),'UCAS Code')]/../following-sibling::p//text()").extract()

                clear_space(ucascode)
                print("ucascode: ", ucascode)
                item['other'] = ' '.join(ucascode).strip()
                print("item['other'] = ", item['other'])

                ucascode_re = re.findall(r"UCAS\sCode:\w{4}|UCAS\sCodes:\w{4}|UCAS\sCodes:\s\w{4}", ''.join(ucascode).strip())
                print("ucascode_re: ", ucascode_re)
                if len(ucascode_re) > 0:
                    item['ucascode'] = ''.join(ucascode_re).replace("UCAS Code:", "").replace("UCAS Codes:", "").strip()
                print("item['ucascode'] = ", item['ucascode'])

                yield item
        except Exception as e:
            # Best-effort scraping: record the failure and the URL, then move on.
            with open("scrapySchool_England_Ben/error/" + item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 19

0

Exibir arquivo

Arquivo: UniversityOfLincoln_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.lincoln.ac.uk/"
        item['university'] = "University of Lincoln"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = 'University of Lincoln, Brayford Pool, Lincoln, LN6 7TS'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            if "Foundation" not in item['major_type1']:
                # //table[@id='newTitle']/tbody[@id='newTitleBody']/tr/td/h1[1]/a
                programmeDegreetype = response.xpath("//div[@id='CourseTitleApms']/h1//text()").extract()
                clear_space(programmeDegreetype)
                # print("programmeDegreetype: ", programmeDegreetype)
                if len(programmeDegreetype) > 0:
                    programmeDegreetypeStr = programmeDegreetype[0].strip()

                degree_type = re.findall(r"^\w+\s\(Hons\)|^\(\w+\)|^\w+", programmeDegreetypeStr)
                # print("degree_type: ", degree_type)
                degree_type_str = ''.join(degree_type).strip()
                item['degree_name'] = ''.join(degree_type).replace("(Hons)", "").replace("(", "").replace(")", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                item['programme_en'] = programmeDegreetypeStr.replace(degree_type_str, '').strip()
                print("item['programme_en']: ", item['programme_en'])

                ucascode = response.xpath("//div[@class='nd_2019-20']//span[@class='blue'][contains(text(),'UCAS Code:')]/..//text()").extract()
                if len(ucascode) == 0:
                    ucascode = response.xpath("//span[@class='blue'][contains(text(),'UCAS Code:')]/..//text()").extract()
                clear_space(ucascode)
                # print("ucascode: ", ucascode)
                item['ucascode'] = ''.join(ucascode).replace("UCAS Code:", "").strip()
                # print("item['ucascode'] = ", item['ucascode'])

                # //span[@id='durationFT']
                duration = response.xpath("//div[@class='nd_2019-20']//span[contains(text(),'Full-time Duration')]/..//text()").extract()
                if len(duration) == 0:
                    duration = response.xpath("//span[contains(text(),'Full-time Duration')]/..//text()").extract()
                clear_space(duration)
                # print("duration: ", duration)
                duration_str = ''.join(duration)

                duration_list = getIntDuration(duration_str)
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                department = response.xpath("//span[contains(text(),'School:')]/following-sibling::a//text()").extract()
                clear_space(department)
                if len(department) > 0:
                    item['department'] = department[0]
                # print("item['department']: ", item['department'])

                dep_dict = {"lincoln school of architecture and the built environment": "College of Arts",
    "lincoln school of design": "College of Arts",
    "lincoln school of film and media": "College of Arts",
    "school of english and journalism": "College of Arts",
    "school of fine and performing arts": "College of Arts",
    "school of history and heritage": "College of Arts",
    "school of chemistry": "College of Science",
    "school of computer science": "College of Science",
    "school of engineering": "College of Science",
    "school of geography": "College of Science",
    "school of life sciences": "College of Science",
    "school of mathematics and physics": "College of Science",
    "school of pharmacy": "College of Science",
    "national centre for food manufacturing": "College of Science",
    "lincoln institute for agri-tech": "College of Science",
    "school of education": "College of Social Science",
    "school of health and social care": "College of Social Science",
    "professional development centre": "College of Social Science",
    "lincoln law school": "College of Social Science",
    "school of psychology": "College of Social Science",
    "school of social and political sciences": "College of Social Science",
    "school of sport and exercise science": "College of Social Science",}
                if item['department'] != "Lincoln Business School":
                    item['department'] = dep_dict.get(item['department'].lower())
                # print("item['department']1: ", item['department'])

                if item['department'] == None:
                    item['department'] = ''.join(response.xpath("//div[@class='breadcrumb-list']//span//a[@href='/home/collegeofsocialscience/']//text()").extract()).strip()
                # print("item['department']2: ", item['department'])

                # //div[@id='feesTables']/table
                fee = response.xpath("//div[@class='nd_2019-20']//div[@class='panel-body']//table[2]//td[contains(text(),'Full-time')]/following-sibling::*[last()]//text()").extract()
                if len(fee) == 0:
                    fee = response.xpath(
                        "//div[@class='panel-body']//table[2]//td[contains(text(),'Full-time')]/following-sibling::*[last()]//text()").extract()
                clear_space(fee)
                # print("fee: ", fee)
                feeStr = ''.join(fee)
                tuitionfee = getTuition_fee(feeStr)
                item['tuition_fee'] = tuitionfee
                if item['tuition_fee'] == 0:
                    item['tuition_fee'] = None
                # print("item['tuition_fee']: ", item['tuition_fee'])

                # //h2[contains(text(),'The Course')]/..
                overview = response.xpath("//h2[contains(text(),'The Course')]/..").extract()
                # print("overview: ", overview)
                if len(overview) > 0:
                    item['overview_en'] = remove_class(clear_lianxu_space([overview[-1]]))
                # print("item['overview_en']: ", item['overview_en'])

                modules_en = response.xpath("//a[contains(text(),'Modules')]/../../..").extract()
                modules_en = response.xpath(
                    "//div[@id='collapse62019-20']//div[@class='tab-content clearfix']").extract()

                if len(modules_en) > 0:
                    item['modules_en'] = remove_class(clear_lianxu_space([modules_en[-1]]))
                if item['modules_en'] == "":
                    item['modules_en'] = None
                    print("*** modules_en")
                else:
                    print("===", item['modules_en'])
                    del_cont = re.findall(r"<br>Find out more</p><div><span>.*?</em></span>", item['modules_en'])
                    print("del_cont==", del_cont)
                    if len(del_cont) > 0:
                        for delc in del_cont:
                            item['modules_en'] = item['modules_en'].replace(delc, '<div>').strip()
                print("item['modules_en']: ", item['modules_en'])

                assessment_en = response.xpath(
                    "//a[contains(text(),'How You Are Assessed')]/../../..|//a[contains(text(),'How you are assessed')]/../../..").extract()
                if len(assessment_en) > 0:
                    item['assessment_en'] = remove_class(clear_lianxu_space([assessment_en[-1]]))
                # print("item['assessment_en']: ", item['assessment_en'])

                interview_desc_en = response.xpath(
                    "//a[contains(text(),'Interviews & Applicant Days')]/../../..").extract()
                if len(interview_desc_en) > 0:
                    item['interview_desc_en'] = remove_class(clear_lianxu_space([interview_desc_en[-1]]))
                # print("item['interview_desc_en']: ", item['interview_desc_en'])

                alevel = response.xpath(
                    "//*[contains(text(),'GCE Advanced Levels')]/text()|//*[contains(text(),'A Level')]/text()").extract()
                if len(alevel) > 0:
                    item['alevel'] = clear_lianxu_space([alevel[-1]])
                print("item['alevel']: ", item['alevel'])

                ib = response.xpath(
                    "//p[contains(text(),'International Baccalaureate')]").extract()
                if len(ib) > 0:
                    item['ib'] = remove_tags(clear_lianxu_space([ib[-1]]))
                # print("item['ib']: ", item['ib'])

                rntry_requirements = response.xpath(
                    "//a[contains(text(),'Entry Requirements')]/../../..|//a[contains(text(),'Entry requirements')]/../../..").extract()
                if len(rntry_requirements) > 0:
                    rntry_requirements = remove_tags(clear_lianxu_space([rntry_requirements[-1]]))
                # print("rntry_requirements: ", rntry_requirements)

                ielts = re.findall(r"IELTS.{1,80}", rntry_requirements)
                item['ielts_desc'] = ''.join(ielts).strip()
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                career = response.xpath("//div[@id='CourseCareersApms']").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                # print("item['career_en']: ", item['career_en'])

                # if item['ielts_desc'] == "":
                #     item['ielts_desc'] = "Prospective students require IELTS 6.0 (with no less than 5.5 in each band score) or an equivalent qualification. Please note that some courses require a higher score."
                #     item['ielts'] = 6.0
                #     item['ielts_l'] = 5.5
                #     item['ielts_s'] = 5.5
                #     item['ielts_r'] = 5.5
                #     item['ielts_w'] = 5.5
                # print("******item['ielts_desc']: ", item['ielts_desc'])
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/englishlanguagerequirementsandsupport/englishlanguagerequirements/
                if item['ielts'] == "6.5":
                    item['toefl'] = 90
                    item['toefl_l'] = 20
                    item['toefl_s'] = 22
                    item['toefl_r'] = 21
                    item['toefl_w'] = 22
                elif item['ielts'] == "7.0":
                    item['toefl'] = 100
                    item['toefl_l'] = 22
                    item['toefl_s'] = 23
                    item['toefl_r'] = 23
                    item['toefl_w'] = 23
                # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

                # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/entryrequirementsandyourcountry/china/
                item["require_chinese_en"] = remove_class(clear_lianxu_space(["""<div class="panel">
<div class="panel-heading">
<h4 class="panel-title">
<a data-toggle="collapse" href="#countryUndergraduateTab">
<em class="more-less glyphicon glyphicon-menu-down"></em>Undergraduate Entry
</a>
</h4>
</div>
<div id="countryUndergraduateTab" class="panel-collapse collapse">
<div class="panel-body">
<p>Prospective students require one of the following qualifications for entry into year one of an undergraduate degree:</p>
<ul>
<li>Successful completion of a Foundation programme with a minimum of 50% plus an average of 70% or above in High School. Please note that some programmes may require a higher foundation score e.g. 60%.</li>
<li>Successful completion of the first year of a Chinese degree / Diploma with an average grade of 70% or above.</li>
</ul>
<p><strong>&nbsp;</strong></p>
<p><strong>HND Students (BTEC and SQA)</strong></p>
<p>Students who have successfully completed a HND BTEC or SQA qualification may be accepted directly into year two or three of a University of Lincoln undergraduate course on a case by case basis.</p>
<p><strong>Chinese Degree / Diploma</strong></p>
<p>Students who have successfully completed the second or third year of a Chinese Degree or Diploma may be considered for direct entry into year two or three of a University of Lincoln undergraduate course on a case by case basis. For more information, please contact the International Admissions team:&nbsp;<a href="mailto:intadmissions&#64;lincoln&#46;ac&#46;uk">intadmissions&#64;lincoln&#46;ac&#46;uk</a>.</p>
<p>&nbsp;</p>	
<!-- START ADVANCED ENTRY (UNDERGRADUATE) -->
<p><strong>Advanced Entry (Undergraduate)</strong></p>
<p>Depending on your academic background and intended course of study, it may be possible to apply for advanced entry into year 2 or 3 of a University of Lincoln undergraduate course.</p>

<!-- START COUNTRY SPECIFIC ADVANCED ENTRY (UNDERGRADUATE) -->


<!-- END COUNTRY SPECIFIC ADVANCED ENTRY (UNDERGRADUATE) -->

<p id="advEntryUgEu">For more information, please contact the Student Administration Team: <a href="mailto:[email protected]">[email protected]</a>.</p>
<p id="advEntryUgInternational">For more information, please contact the International Admissions Team: <a href="mailto:[email protected]">[email protected]</a>.</p>
<!-- END ADVANCED ENTRY (UNDERGRADUATE) -->
</div>
</div>					
</div>
"""]))
                # print("item['require_chinese_en']: ", item['require_chinese_en'])

                item['apply_proces_en'] = "http://www.lincoln.ac.uk/home/studywithus/undergraduatestudy/howtoapply/"
                # print("item['apply_proces_en']: ", item['apply_proces_en'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 20

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "St George's, University of London"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Cranmer Terrace, London SW17 0RE"
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programmeDegree_name = response.xpath(
                "//div[@class='inner']/h1//text()").extract()
            programmeDegree_nameStr = ''.join(programmeDegree_name).strip()
            # print("programmeDegree_nameStr: ", programmeDegree_nameStr)

            if "Foundation" not in programmeDegree_nameStr:
                degree_name = re.findall(r"\(.*\)$|\w+\s\(.*\)$|\w+$",
                                         programmeDegree_nameStr)
                degree_name_str = ''.join(degree_name).strip()
                item['degree_name'] = degree_name_str.replace("(", "").replace(
                    ")", "").replace("Hons", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                programme = programmeDegree_nameStr.replace(
                    degree_name_str, "").strip()
                item['programme_en'] = programme
                print("item['programme_en']: ", item['programme_en'])

                ucascode = response.xpath(
                    "//*[contains(text(),'UCAS code')]//text()").extract()
                clear_space(ucascode)
                # print("ucascode: ", ucascode)
                if len(ucascode) > 0:
                    ucascode_re = re.findall(r"UCAS\scode\s\w{4}",
                                             ''.join(ucascode))
                    # print("ucascode_re: ", ucascode_re)
                    item['ucascode'] = ''.join(ucascode_re).replace(
                        "UCAS code", "").strip()
                # print("item['ucascode'] = ", item['ucascode'])

                other = response.xpath(
                    "//img[@alt='globe']/../..//text()").extract()
                if len(other) == 0:
                    other = response.xpath(
                        "//td[contains(text(),'Open to UK and EU students. Not currently open to ')]//text()"
                    ).extract()
                item['other'] = clear_lianxu_space(other)
                # print("item['other'] = ", item['other'])

                # start_date = response.xpath("//dt[contains(text(), 'Start date')]/following-sibling::dd[1]//text()").extract()
                # clear_space(start_date)
                # # print("start_date: ", start_date)
                # item['start_date'] = getStartDate(''.join(start_date))
                # # print("item['start_date']: ", item['start_date'])

                duration = response.xpath(
                    "//img[@alt='Calendar']/../following-sibling::td//text()"
                ).extract()
                if len(duration) == 0:
                    duration = response.xpath(
                        "//img[@alt='Calendar']/../../following-sibling::td//text()"
                    ).extract()
                clear_space(duration)
                # print("duration: ", ''.join(duration))

                duration_list = getIntDuration(''.join(duration))
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                # //p[contains(text(),'Non-UK/EU (International) application deadline')]
                deadline = response.xpath(
                    "//*[contains(text(),'Application deadline')]//text()|//*[contains(text(),'UCAS deadline')]//text()"
                ).extract()
                clear_space(deadline)
                # print("deadline: ", deadline)
                item['deadline'] = getStartDate(''.join(deadline).replace(
                    "Application deadline",
                    "").replace("is", "").replace("UCAS deadline",
                                                  "").replace(":", "").strip())
                if "2018" not in item['deadline'] and item[
                        'deadline'] != "" and "2019" not in item['deadline']:
                    item['deadline'] = ''.join(deadline).replace(
                        "Application deadline",
                        "").replace("is", "").replace("UCAS deadline",
                                                      "").replace(":",
                                                                  "").strip()
                # print("item['deadline']: ", item['deadline'])

                # location = response.xpath("//*[contains(text(),'Study location:')]//text()").extract()
                # item['location'] = ''.join(location).replace("Study location:", "").strip()
                # print("item['location']: ", item['location'])

                tuition_fee = response.xpath(
                    "//h3[contains(text(),'International (Non-EU) Student Fees')]/following-sibling::table//td[contains(text(),'2019/20')]/following-sibling::td[1]//text()|"
                    "//table//p[contains(text(),'2018 entry Non-EU')]//text()|"
                    "//table[2]/tbody/tr[4]/td/p[contains(text(),'2018 Non-EU')]/following-sibling::*/*[1]//text()|"
                    "//table//p[contains(text(),'2018 Non-EU')]/following-sibling::*[1]/*[1]//text()"
                ).extract()
                clear_space(tuition_fee)
                # print("tuition_fee: ", ''.join(tuition_fee))
                tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = getTuition_fee(
                        ''.join(tuition_fee_re))
                # print("item['tuition_fee']: ", item['tuition_fee'])

                overview_en = response.xpath(
                    "//p[@class='first']|//table[1]/following-sibling::*[position()<last()-1]"
                ).extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en)).replace(
                        "<p><button>Make an enquiry</button></p>", "").strip()
                # print("item['overview_en']: ", item['overview_en'])

                entry_url = response.xpath(
                    "//a[contains(text(),'Entry')]/@href").extract()
                # print("entry_url: ", entry_url)
                if len(entry_url) != 0:
                    parse_entry_url = "https://www.sgul.ac.uk" + entry_url[0]
                    # print("parse_entry_url: ", parse_entry_url)
                    entry_dict = self.parse_rntry_requirements(parse_entry_url)
                    # print(entry_dict)
                    # item['rntry_requirements'] = entry_dict.get('rntry_requirements')

                    item['ielts_desc'] = entry_dict.get('ielts_desc')
                    item['alevel'] = entry_dict.get('alevel')
                    item['ib'] = entry_dict.get('ib')
                # print("item['ielts_desc']: ", item['ielts_desc'])
                # print("item['alevel']: ", item['alevel'])
                # print("item['ib']: ", item['ib'])

                ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                if len(ielts_list) == 1:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[0]
                    item['ielts_s'] = ielts_list[0]
                    item['ielts_r'] = ielts_list[0]
                    item['ielts_w'] = ielts_list[0]
                elif len(ielts_list) == 2:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[1]
                    item['ielts_s'] = ielts_list[1]
                    item['ielts_r'] = ielts_list[1]
                    item['ielts_w'] = ielts_list[1]
                elif len(ielts_list) == 5:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[1]
                    item['ielts_s'] = ielts_list[4]
                    item['ielts_r'] = ielts_list[2]
                    item['ielts_w'] = ielts_list[3]
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                modules_url = response.xpath(
                    "//a[contains(text(),'Module')]/@href").extract()
                # print("modules_url: ", modules_url)
                if len(modules_url) != 0:
                    parse_modules_url = "https://www.sgul.ac.uk" + modules_url[
                        0]
                    # print("parse_modules_url: ", parse_modules_url)
                    item['modules_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_modules(parse_modules_url))).strip()
                # print("item['modules_en']: ", item['modules_en'])

                assessment_en_url = response.xpath(
                    "//a[contains(text(),'Studying')]/@href").extract()
                # print("assessment_en_url: ", assessment_en_url)
                if len(assessment_en_url) != 0:
                    parse_assessment_en_url = "https://www.sgul.ac.uk" + assessment_en_url[
                        0]
                    # print("parse_assessment_en_url: ", parse_assessment_en_url)
                    item['assessment_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_assessment_en(
                                parse_assessment_en_url))).strip()
                # print("item['assessment_en']: ", item['assessment_en'])

                career_en_url = response.xpath(
                    "//a[contains(text(),'Career')]/@href").extract()
                # print("career_en_url: ", career_en_url)
                if len(career_en_url) != 0:
                    parse_career_en_url = "https://www.sgul.ac.uk" + career_en_url[
                        0]
                    # print("parse_career_en_url: ", parse_career_en_url)
                    item['career_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_career_en(
                                parse_career_en_url))).replace(
                                    "<p><img></p>", "").strip()
                # print("item['career_en']: ", item['career_en'])

                apply_proces_en_url = response.xpath(
                    "//a[contains(text(),'Apply')]/@href|//a[contains(text(),'Application and interview')]/@href"
                ).extract()
                # print("apply_proces_en_url: ", apply_proces_en_url)
                if len(apply_proces_en_url) != 0:
                    parse_apply_proces_en_url = "https://www.sgul.ac.uk" + apply_proces_en_url[
                        0]
                    # print("parse_apply_proces_en_url: ", parse_apply_proces_en_url)
                    item['apply_proces_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_apply_proces_en(
                                parse_apply_proces_en_url))).replace(
                                    "<p><img></p>", "").strip()
                # print("item['apply_proces_en']: ", item['apply_proces_en'])

                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 21

0

Exibir arquivo

    def parse_data(self, response):
        """Scrape a Loughborough University course page into item(s).

        Fills a ``ScrapyschoolEnglandBenItem`` from XPath selections on
        ``response`` and yields it. When the page has a 'Study options'
        label the item is yielded once. Otherwise it is yielded twice: first
        with the duration of the ``default`` option and the first UCAS code,
        then with the duration of the ``variant`` option and the last UCAS
        code.

        Any exception raised while parsing is appended, together with the
        page URL, to a per-university error file and printed; it is not
        re-raised.

        Args:
            response: The page response. ``response.meta`` is looked up by
                programme name to obtain the department.

        Yields:
            The populated item.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Loughborough University"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)

        ucas_xpath = (
            "//span[@class='list__text'][contains(text(),'UCAS code')]"
            "/../following-sibling::dd//text()")

        # Programmes whose IELTS requirement is 7.0 overall with 6.5 per band.
        higher_ielts_programmes = {
            "Information Management and Business",
            "Accounting and Financial Management",
            "Management Sciences",
            "Retailing, Marketing and Management",
            "International Business",
            "Finance and Management",
            "Economics",
            "Business Economics and Finance",
            "International Economics",
            "Economics with Geography",
            "Economics with Politics",
            "Economics with Accounting",
            "Economics and Management",
        }

        def apply_duration(duration_parts):
            """Store the parsed duration on the item and echo it."""
            duration_list = getIntDuration(''.join(duration_parts))
            # Only a two-element result is trusted; anything else leaves the
            # item's current duration values untouched.
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

        def apply_ucascode(index, label):
            """Store the UCAS code found at ``index`` and echo it."""
            ucascode = response.xpath(ucas_xpath).extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            if ucascode:
                item['ucascode'] = ucascode[index].strip()
            print(label, item['ucascode'])

        try:
            # Degree name
            degree_name = response.xpath(
                "//span[@class='course-info__qualification course-info__qualification--default active']//text()").extract()
            item['degree_name'] = ''.join(degree_name).replace(', PG certificate', '').strip()
            print("item['degree_name']: ", item['degree_name'])

            # Programme
            programme_en = response.xpath(
                "//h1[@class='course-info__heading']/text()").extract()
            clear_space(programme_en)
            item['programme_en'] = ''.join(programme_en).strip()
            print("item['programme_en']: ", item['programme_en'])

            # Department (passed in through the request meta, keyed by
            # programme name, with one hard-coded override).
            item['department'] = response.meta.get(item['programme_en'])
            if item['programme_en'] == "Finance and Management":
                item['department'] = "Business and Economics"
            print("item['department']: ", item['department'])

            start_date = response.xpath(
                "//span[@class='list__text'][contains(text(),'Start date')]/../following-sibling::dd[1]//text()").extract()
            clear_space(start_date)
            if start_date:
                item['start_date'] = getStartDate(start_date[0])

            tuition_fee = response.xpath(
                "//span[@class='list__text'][contains(text(),'International fee')]/../following-sibling::dd//text()").extract()
            clear_space(tuition_fee)
            if tuition_fee:
                item['tuition_fee_pre'] = '£'
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))

            location = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'Location:')]/following-sibling::dd//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()

            overview_en = response.xpath("//div[@id='overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))

            # Modules
            modules_en = response.xpath("//div[@id='study']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))

            # Teaching and assessment
            assessment_en = response.xpath(
                "//div[@class='course-section course-section--assessed']").extract()
            item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))

            # A-level requirement is read from the 'Typical offer' entry.
            # (An earlier 'A-Level' <dt> query was overwritten before use
            # and has been dropped.)
            alevel = response.xpath(
                "//span[@class='list__text'][contains(text(),'Typical offer')]/../following-sibling::dd//text()").extract()
            clear_space(alevel)
            if alevel:
                item['alevel'] = alevel[0].strip()

            ib = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'IB')]/following-sibling::dd//text()").extract()
            item['ib'] = ''.join(ib).strip()

            require_chinese_en = response.xpath(
                "//div[@id='china']").extract()
            item['require_chinese_en'] = remove_class(clear_lianxu_space(require_chinese_en))

            # IELTS: default requirement first, then per-programme overrides.
            item['ielts'] = 6.5
            item['ielts_l'] = 6.0
            item['ielts_s'] = 6.0
            item['ielts_r'] = 6.0
            item['ielts_w'] = 6.0
            if item['programme_en'] == "Communication and Media Studies":
                # Only the overall score is raised; bands stay at 6.0.
                item['ielts'] = 7.0
            elif item['programme_en'] in higher_ielts_programmes:
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5

            career = response.xpath("//div[@id='career']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            item['apply_proces_en'] = "http://www.lboro.ac.uk/study/undergraduate/apply/"

            duration = response.xpath(
                "//label[contains(text(),'Study options')]/following-sibling::span//text()").extract()
            clear_space(duration)
            print("duration: ", duration)
            if duration:
                apply_duration(duration)
                apply_ucascode(0, "item['ucascode'] = ")
                yield item
            else:
                # No 'Study options' label: emit one record per <option>,
                # pairing 'default' with the first UCAS code and 'variant'
                # with the last one.
                # NOTE(review): the same item object is mutated and yielded
                # a second time - confirm downstream consumers do not keep a
                # reference to the first yield.
                for option_value, tag, ucas_index in (('default', '1', 0),
                                                      ('variant', '2', -1)):
                    duration = response.xpath(
                        "//option[@value='%s']//text()" % option_value).extract()
                    clear_space(duration)
                    print("duration%s: " % tag, duration)
                    apply_duration(duration)
                    apply_ucascode(ucas_index, "item['ucascode']%s = " % tag)
                    yield item
        except Exception as e:
            # Best-effort error log: record the failure and carry on with the
            # crawl instead of propagating.
            with open("scrapySchool_England_Ben/error/"+item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 22

0

Exibir arquivo

    def parse_data(self, response):
        """Scrape a University of Central Lancashire course page.

        Fills a ``ScrapyschoolEnglandBenItem`` from XPath selections on
        ``response``. Pages whose programme title contains "Foundation" are
        not processed beyond the title and degree name and yield nothing;
        other pages yield the item when the programme title is non-empty.

        Any exception raised while parsing is appended, together with the
        page URL, to a per-university error file and printed; it is not
        re-raised.

        Args:
            response: The page response. ``response.meta`` is looked up by
                the page URL to obtain ``major_type1``.

        Yields:
            The populated item (at most once).
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Central Lancashire"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])

        def select(expression):
            """Return the extracted results of an XPath query on the page."""
            return response.xpath(expression).extract()

        try:
            # Programme title; fall back to the 'marketing-version' markup
            # when the primary selector finds nothing.
            title_parts = (
                select("//div[@id='TopGraphic']/div[@class='twelvecol last']/h2/text()")
                or select("//div[@class='marketing-version']/div[@class='course-title']/h1/text()"))
            clear_space(title_parts)
            item['programme_en'] = ''.join(title_parts).strip()
            print("item['programme_en']: ", item['programme_en'])

            # Degree name, with the same two-layout fallback.
            award_parts = (
                select("//div[@id='TopGraphic']/div[@class='twelvecol last']/h2/span/text()")
                or select("//div[@class='marketing-version']/div[@class='course-title']/h1/span/text()"))
            clear_space(award_parts)
            item['degree_name'] = ''.join(award_parts).replace("(Hons)", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            # Foundation programmes are not scraped any further.
            if "Foundation" in item['programme_en']:
                return

            # Department: first contact line naming the school.
            contact_lines = select(
                "//h4[contains(text(),'Contact Us')]/following-sibling::*[1]//text()")
            clear_space(contact_lines)
            based_in = "This course is based in the"
            school_line = next(
                (line for line in contact_lines if based_in in line), None)
            if school_line is not None:
                item['department'] = school_line.replace(based_in, "").strip()

            # Duration (the raw text is also kept in 'other').
            duration_parts = select(
                "//h4[contains(text(), 'Duration:')]/..//text()")
            clear_space(duration_parts)
            duration_text = ''.join(duration_parts).strip()
            item['other'] = duration_text
            parsed_duration = getIntDuration(duration_text)
            if len(parsed_duration) == 2:
                item['duration'] = parsed_duration[0]
                item['duration_per'] = parsed_duration[-1]

            item['location'] = ''.join(select(
                "//h4[contains(text(), 'Campus')]/following-sibling::p[1]//text()"))

            item['start_date'] = getStartDate(''.join(select(
                "//h4[contains(text(), 'Start Date:')]/following-sibling::p[1]//text()")))

            item['overview_en'] = remove_class(clear_lianxu_space(select(
                "//div[@class='overview']|//div[@id='outline']/div[position()<3]")))

            # Entry requirements: kept both as a list of text nodes (for
            # exact-label lookups) and as one joined string (for regex).
            entry_requirements = select(
                "//div[@class='sevencol']//div[contains(@class,'entry-requirements')]//text()")
            clear_space(entry_requirements)
            print(entry_requirements)
            requirements_text = ''.join(entry_requirements).strip()

            # A-level: everything before the first BTEC/QCFBED label found,
            # tried in this order; otherwise the node after "A Levels".
            item['alevel'] = None
            for marker in ("BTEC Extended Diploma:",
                           "BTEC Extended Diploma:\xa0",
                           "BTEC Extended Diploma",
                           "QCFBED:"):
                if marker in entry_requirements:
                    cut = entry_requirements.index(marker)
                    item['alevel'] = clear_lianxu_space(entry_requirements[:cut])
                    break
            else:
                if "A Levels" in entry_requirements:
                    after_label = entry_requirements[
                        entry_requirements.index("A Levels") + 1]
                    item['alevel'] = clear_lianxu_space([after_label])
            if item['alevel'] is not None:
                item['alevel'] = item['alevel'].strip().strip(":").strip()
            print("item['alevel']: ", item['alevel'])

            # IB: the node following the first matching label, tried in
            # this order.
            item['ib'] = None
            for marker in ("International Baccalaureate:",
                           "International Baccalaureate Diploma:\xa0",
                           "International Baccalaureate :",
                           "International Baccalaureate:\xa0",
                           "International Baccalaureate",
                           "International Baccalaureate Diploma:"):
                if marker in entry_requirements:
                    after_label = entry_requirements[
                        entry_requirements.index(marker) + 1]
                    item['ib'] = clear_lianxu_space([after_label])
                    break
            if item['ib'] is not None:
                item['ib'] = item['ib'].strip().strip(":").strip()
            print("item['ib']: ", item['ib'])

            # IELTS description: the label plus the node after it, or a
            # regex window around "IELTS" in the joined text.
            if "IELTS:" in entry_requirements:
                start = entry_requirements.index("IELTS:")
                ielts_parts = entry_requirements[start:start + 2]
            else:
                ielts_parts = re.findall(r'.{1,50}IELTS.{1,80}',
                                         requirements_text)
            item['ielts_desc'] = ''.join(ielts_parts).strip()

            raw_scores = re.findall(
                r"[5-9]\.\d\s|[5-9]\.\d,|[5-9]\.\d\.|[5-9]\.\d$|[5-9]\s|[5-9]\.",
                item['ielts_desc'])
            scores = [
                raw.strip().strip('.').replace(',', '').strip()
                for raw in raw_scores
            ]
            # For each supported number of scores found, the index of the
            # score used for (overall, listening, speaking, reading,
            # writing).
            score_layout = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                3: (0, 0, 0, 1, 2),
            }
            layout = score_layout.get(len(scores))
            if layout is not None:
                band_fields = ('ielts', 'ielts_l', 'ielts_s', 'ielts_r',
                               'ielts_w')
                for field, position in zip(band_fields, layout):
                    item[field] = scores[position]

            item['modules_en'] = remove_class(
                clear_lianxu_space(select("//div[@id='caag']")))

            item['assessment_en'] = remove_class(clear_lianxu_space(select(
                "//*[contains(text(),'Learning Environment and Assessment')]|//*[contains(text(),'Learning Environment and Assessment')]/following-sibling::p")))

            item['career_en'] = remove_class(clear_lianxu_space(select(
                "//h4[contains(text(),'Industry Links')]/..|//h4[contains(text(),'Opportunities')]/..|"
                "//strong[contains(text(),'Careers')]/../..")))

            # Fee source:
            # https://www.uclan.ac.uk/study_here/fees_and_finance/international_tuition_fees.php#international
            item['tuition_fee'] = '12450'
            if item['department'] in (
                    "School of Forensic and Applied Sciences",
                    "School of Physical Sciences and Computing",
                    "School of Pharmacy and Biomedical Sciences",
                    "School of Engineering"):
                item['tuition_fee'] = '13450'
            item['tuition_fee_pre'] = "£"

            item['require_chinese_en'] = (
                "<h2>Undergraduate – Year 0 entry 200 tariff points/80 tariff points</h2>"
                "<p>Senior Secondary School Graduation Certificate 60% average</p>"
                "<p>Completion of second year Senior Secondary School 70% average</p>"
                "<h2>Undergraduate - Year 1 entry 280 tariff points/112 tariff points</h2>"
                "<p>3 Year leaving certificate from SeniorHigh School with 75%</p>"
                "<p>Chinese National University GaoKao University Entrance Test with 450+ (maximum is 750, 150 for each of five subjects)</p>"
                "<p>Successful completion of one year Higher Education is acceptable in lieu of Grade 3 High School at 85%</p>"
                "<p>Completion of Shenzhen University International Foundation Programme with 60% - Group B</p>")
            item['apply_proces_en'] = 'https://www.uclan.ac.uk/study_here/how_to_apply/international.php'

            ucas_parts = select(
                "//h4[contains(text(),'UCAS Code:')]/following-sibling::*//text()")
            clear_space(ucas_parts)
            item['ucascode'] = ''.join(ucas_parts).strip()

            # Pages without a programme title are dropped.
            if item['programme_en'] != "":
                yield item
        except Exception as e:
            # Best-effort error log: record the failure and carry on with the
            # crawl instead of propagating.
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            with open(error_path, 'a', encoding="utf-8") as log:
                log.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 23

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bristol"
        # items['country'] = "England"
        # items["website"] = "https://www.bristol.ac.uk/"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            # 专业
            course = response.xpath(
                "//span[@property='programname']//text()").extract()
            # print("course = ", course)
            item['programme_en'] = ''.join(course).replace("\n", " ").replace(
                "\r", " ").strip()
            print("item['programme_en']: ", item['programme_en'])

            # degreeaward
            degreeaward = response.xpath(
                "//span[@property='award']//text()").extract()
            # print("degreeaward = ", degreeaward)
            item['degree_name'] = clear_space_str(''.join(degreeaward))
            print("item['degree_name']: ", item['degree_name'])

            ucascode = response.xpath(
                "//th[contains(text(),'UCAS code')]/../td//text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])
            if item['ucascode'] == "":
                ucascode = response.xpath(
                    "//th[contains(text(),'Application method')]/following-sibling::td//text()"
                ).extract()
                clear_space(ucascode)
                # print("ucascode1: ", ucascode)
                item['ucascode'] = ''.join(ucascode).replace(
                    "Entry by transfer from", "").replace(
                        "Entry by transfer after two years from",
                        "").replace("Entry by transfer after year one of",
                                    "").replace("at the end of year one",
                                                "").strip()
            # print("item['ucascode']1: ", item['ucascode'])

            # duration
            duration = response.xpath(
                "//th[contains(text(),'Course duration')]/../td//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)

            # duration_list = getIntDuration(''.join(duration))
            if len(duration) == 2:
                item['duration'] = int(duration[0])
                if 'y' in ''.join(duration).lower():
                    item['duration_per'] = 1
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # location
            location = response.xpath(
                "//th[contains(text(),'Location of course')]/../td//text()"
            ).extract()
            item['location'] = clear_space_str(''.join(location))
            # print("item['location']: ", item['location'])

            # startdate
            startdate = response.xpath(
                "//p[@class='year-of-entry']/text()").extract()
            clear_space(startdate)
            # print("startdate = ", startdate)
            if len(startdate) > 0:
                item['start_date'] = ''.join(startdate).replace("entry",
                                                                "").strip()
            # print("item['start_date'] = ", item['start_date'])

            tuitionFee = response.xpath(
                "//li[contains(text(),'International students: £')]//text()"
            ).extract()
            # print("tuitionFee = ", tuitionFee)
            if len(tuitionFee) > 0:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = getTuition_fee(''.join(tuitionFee))
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            # print("item['tuition_fee']: ", item['tuition_fee'])

            # deadline
            # deadline = response.xpath("//div[@id='apply']/div[@class='apply-deadline']/p[1]//text()").extract()
            # # print("deadline = ", deadline)
            # item['deadline'] = getStartDate(''.join(deadline))
            # # print("item['deadline']: ", item['deadline'])

            # department
            department = response.xpath(
                "//div[@id='contact']/p[@class='pg-contact-address']/text()"
            ).extract()
            clear_space(department)
            print("department1 = ", department)
            for d in department:
                if "School" in d or "Faculty" in d:
                    item['department'] = d
            # print("item['department']: ", item['department'])
            if item['department'] == "":
                allcontent = response.xpath(
                    "//main[@class='content']//text()").extract()
                clear_space(allcontent)
                department_re = re.findall(r"School\sof.{1,30}",
                                           ''.join(allcontent), re.I)
                # print("department_re: ", department_re)
                if len(department_re) > 0:
                    item['department'] = department_re[0].strip()
            # print("item['department']1: ", item['department'])

            overview = response.xpath(
                "//div[@id='course-description']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            assessment_en = response.xpath(
                "//div[@id='teaching']|//div[@id='assessment']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            print("item['assessment_en']: ", item['assessment_en'])

            alevel = response.xpath(
                "//div[@id='typical-offer']//table//tr/th[contains(text(), 'A-level')]/../td//text()"
            ).extract()
            clear_space(alevel)
            # print(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])

            # print(len("36 points overall with 18 at Higher Level, including 6, 5 at Higher Level in two of the following subjects: Biology, Chemistry, Physics, Mathematics, Psychology"))
            if len(item['alevel']) > 160:
                item['alevel'] = ''.join(item['alevel'][:161])
            # print("item['alevel']1 = ", item['alevel'])

            ib = response.xpath(
                "//div[@id='typical-offer']//table//tr/th[contains(text(), 'International Baccalaureate ')]/../td//text()"
            ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib[0]).strip()

            if len(item['ib']) > 160:
                item['ib'] = ''.join(item['ib'][:161])
            # print("item['ib'] = ", item['ib'])

            # 课程结构
            modulesUrl = response.xpath(
                "//div[@id='course-structure']//div[@class='collapsible']//a/@href"
            ).extract()
            # print("modulesUrl: ", modulesUrl)
            modulesUrl = ''.join(modulesUrl).strip()
            if len(modulesUrl) != 0:
                item['modules_en'] = self.parse_modules_en(modulesUrl)[0]
                # print("item['modules_en']: ", item['modules_en'])
                u = self.parse_modules_en(modulesUrl)[1]
                # print(u)
                while len(u) != 0:
                    u1 = "https://www.bris.ac.uk" + ''.join(u)
                    # print("u1=", u1)
                    item['modules_en'] += self.parse_modules_en(u1)[0]
                    u = self.parse_modules_en(u1)[1]
            # print("item['modules_en']1: ", item['modules_en'])

            # 学术要求本科特殊专业要求、IELTS
            entryRequirements = response.xpath(
                "//div[@id='typical-offer']//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entryRequirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = response.xpath(
                "//*[contains(text(),'Profile A')]//text()|//*[contains(text(),'Profile B')]//text()|"
                "//*[contains(text(),'Profile C')]//text()|//*[contains(text(),'Profile D')]//text()|"
                "//*[contains(text(),'Profile E')]//text()|//*[contains(text(),'Profile F')]//text()"
            ).extract()
            item['ielts_desc'] = clear_lianxu_space(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts_desc'] == "Profile A":
                item['ielts'] = 7.5
                item['ielts_l'] = 7.0
                item['ielts_s'] = 7.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 109
                item['toefl_l'] = 25
                item['toefl_r'] = 25
                item['toefl_s'] = 25
                item['toefl_w'] = 29
            elif item['ielts_desc'] == "Profile B":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 7.0
                item['toefl'] = 100
                item['toefl_l'] = 24
                item['toefl_r'] = 24
                item['toefl_s'] = 24
                item['toefl_w'] = 27
            elif item['ielts_desc'] == "Profile C":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 92
                item['toefl_l'] = 23
                item['toefl_r'] = 23
                item['toefl_s'] = 23
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile D":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 92
                item['toefl_l'] = 21
                item['toefl_r'] = 21
                item['toefl_s'] = 21
                item['toefl_w'] = 27
            elif item['ielts_desc'] == "Profile E":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 90
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 20
            elif item['ielts_desc'] == "Profile F":
                item['ielts'] = 6.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 86
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 23
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #       item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # assessment_en, career_en
            assessCareerUrlSplit = response.url.rsplit('/')
            assessCareerUrl = response.url.replace(
                assessCareerUrlSplit[-2] + "/", "").strip()
            print(assessCareerUrl)
            assessCareerDict = self.parseAssessCareer(assessCareerUrl)

            item['assessment_en'] = assessCareerDict.get(
                'assessment_en').strip()
            print("item['assessment_en']: ", item['assessment_en'])

            item['career_en'] = assessCareerDict.get('career_en').strip()
            print("item['career_en']: ", item['career_en'])

            # 申请要求
            apply_desc_en = response.xpath(
                "//div[@id='typical-offer']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(apply_desc_en))
            # print("item['apply_desc_en']: ", item['apply_desc_en'])

            require_chinese_en = """<h2 id="ugentryreqs">Entry requirements for undergraduate courses</h2>
<p>You can apply for undergraduate programmes either through the&nbsp;<a href="http://www.ucas.com/">Universities and Colleges Admissions Service</a>&nbsp;(UCAS) or&nbsp;<a href="http://www.commonapp.org/">The Common Application.</a>&nbsp;Please use only&nbsp;<strong>one</strong>&nbsp;method of applying. If you are using UCAS to apply for other UK universities, please also make your University of Bristol application through UCAS and do not use the Common Application.The UCAS code name and number for this University is BRISL B78.</p>
<p>Individual course entry requirements&nbsp;are listed in our <a href="http://www.bris.ac.uk/study/undergraduate/">Undergraduate Prospectus</a>&nbsp;for each course.</p>
<ul>
<li>Applicants with the Gaozhong Biye Zhengshu (Senior High School Certificate) and Gaokao&nbsp;(Chinese University entrance exam) combined with a successfully completed appropriate <a href="http://www.bris.ac.uk/english-language/study/ifp/" target="_blank">Foundation programme</a> will be considered for admission to our Bachelor's degree courses.</li>
<li>Applicants who have successfully completed the first year of a Chinese University degree at a prestigious university will be considered for admission to the first year of our Bachelor's degree courses.</li>
<li>Applicants will be required to meet the English language requirements for the programme. The profile level requirements can be found on the&nbsp;<a href="http://www.bristol.ac.uk/study/language-requirements/">English language requirements for study</a>&nbsp;page.</li>
</ul>"""
            item["require_chinese_en"] = remove_class(require_chinese_en)
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>How to apply</h2><p>You can apply through Universities and Colleges Admissions Service (UCAS) or the Common Application (Common App). For Engineering Design, Medicine, Dentistry or Veterinary Science courses, you must apply using UCAS.</p> </div> <!-- end: content - how to apply --> <!-- start: drop down - application options --> <div class="main-col-child">  <div class="dropdown"> <h3 class="dropdown-heading">Applying through UCAS</h3> <div class="dropdown-content"> <p>You can apply for a maximum of five courses using the UCAS form. Apply for medicine, dentistry and veterinary courses through UCAS by 15 October. You can only use four of your five UCAS choices to apply to these courses.</p> <p><a class="btn icon-arrow-right" data-tracking-click-url="http://uk.sitestat.com/bristol/bristol-ext/s?study.undergraduate.apply.international.index_html.international-ucas&amp;ns_type=clickout&amp;ns_url=https://www.ucas.com/ucas/undergraduate/getting-started/ucas-undergraduate-international-and-eu-students" href="https://www.ucas.com/ucas/undergraduate/getting-started/ucas-undergraduate-international-and-eu-students">Apply online through UCAS</a><br />Our UCAS institution code is <strong>BRISL B78</strong>.</p> <p>After you have applied, UCAS will give you a ten-digit personal ID number. 
You will need this if you contact the University about your application.</p> <h4>Entering your qualifications</h4> <p>Before you submit your UCAS application, make sure you have included:</p> <ul> <li><strong>Full details of qualifications you have already taken</strong>: include grades/marks for the academic qualifications you've achieved from age 16 (GCSE or equivalent), and any English language qualifications.</li> <li><strong>Full details of the qualifications you are taking:</strong> include current studies (name and expected date of examination and major subjects), English language qualifications, and any resits of previous qualifications you expect to take.</li> </ul> <p>If your qualification offers different levels of study, state which subjects you are studying at the higher level, and which at the standard level.</p> <p>Watch the <a href="https://www.ucas.com/connect/videos?v=/apply-education-page">UCAS how-to guide on entering qualifications</a>.</p> <h4>When to apply</h4> <p>Find the <a href="https://www.ucas.com/ucas/undergraduate/apply-track/when-apply"><span>application deadlines on the UCAS website</span></a>.</p> </div>   <h3 class="dropdown-heading" >Applying through the Common App</h3> <div class="dropdown-content"> <p>You can use Common App to apply for any full-time undergraduate course at Bristol, except Engineering Design, Medicine, Dentistry or Veterinary Science courses. The deadline for applying through Common App is 30 June 2018.</p> <p><a class="btn icon-arrow-right" data-tracking-click-url="http://uk.sitestat.com/bristol/bristol-ext/s?study.undergraduate.apply.international.index_html.international-common&amp;ns_type=clickout&amp;ns_url=https://www.commonapp.org/" href="https://www.commonapp.org/">Apply online through the Common App</a></p> <p>After you have applied, you will be given an application number. 
You will need this if you contact the University about your application.</p> </div>   <h3 class="dropdown-heading" >Applying for direct entry courses</h3> <div class="dropdown-content"> <p>These are our direct entry courses. Please apply using these links and not through UCAS:</p> <ul> <li><a href="/dental/courses/dcp/hygiene/apply/">Diploma in Dental Hygiene</a></li> <li><a href="http://www.bristol.ac.uk/english-language/study/ifp/apply/">International Foundation Programme</a></li> <li><a href="/arts/study/foundation/apply/">Foundation in Arts and Humanities</a></li></ul> """
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])
            yield item
        except Exception as e:
            print("异常：", str(e))
            print("报错链接：", response.url)
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")

Exemplo n.º 24

0

Exibir arquivo

Arquivo: Queen'sUniversityBelfast_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Queen's University Belfast"
        # item['country'] = 'England'
        # item['website'] = 'http://www.qub.ac.uk/'
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            degree_type = response.xpath(
                "//div[@class='columns aligned']//div[@class='column colspan-8']/h2//text()"
            ).extract()
            degree_type = ''.join(degree_type).split("|")
            print("degree_type: ", degree_type)
            if len(degree_type) != 0:
                item['degree_name'] = degree_type[0].strip()
            print("item['degree_name']: ", item['degree_name'])

            # 专业
            programme = response.xpath(
                "//div[@class='columns aligned']//div[@class='column colspan-8']/h1//text()"
            ).extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).replace(
                item['degree_name'], '').strip()
            print("item['programme_en']: ", item['programme_en'])

            # start_date
            start_date = response.xpath(
                "//span[@class='cf-key-details key-entry-year']//text()"
            ).extract()
            clear_space(start_date)
            item['start_date'] = ''.join(start_date).strip()
            print("item['start_date']: ", item['start_date'])

            # duration
            duration = response.xpath(
                "//p[@class='cf-key-details-duration']//span[@class='cf-key-details']//text()"
            ).extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration']-: ", item['duration'])
            print("item['duration_per']-: ", item['duration_per'])

            ucascode = response.xpath(
                "//span[@class='cf-key-details key-ucas-code']//text()"
            ).extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).strip()
            print("item['ucascode'] = ", item['ucascode'])

            # //div[@id='overview']
            overview = response.xpath("//div[@id='overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='overview']
            modules = response.xpath(
                "//h3[@class='alt'][contains(text(),'Course Structure')]/following-sibling::table"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            career = response.xpath(
                "//h3[@class='alt'][contains(text(),'Career Prospects')]/following-sibling::p[1]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //a[@id='teaching']/following-sibling::*[position()<6]
            teaching_assessment = response.xpath(
                "//a[@id='teaching']/following-sibling::*[position()<6]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment).replace("\n", ""))
            # print("item['assessment_en']: ", item['assessment_en'])

            entry_requirements = response.xpath(
                "//div[@id='entry']//text()").extract()
            rntry_requirements = remove_class(
                clear_lianxu_space(entry_requirements))
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = re.findall(r"IELTS.{1,150}", rntry_requirements)
            item['ielts_desc'] = ''.join(ielts)
            print("item['ielts_desc']: ", item['ielts_desc'])
            ieltsDict = get_ielts(''.join(ielts))
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            alevel = response.xpath(
                "//b[contains(text(),'Entry requirements:')]/following-sibling::span//text()"
            ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            print("item['alevel'] = ", item['alevel'])

            # ib = response.xpath(
            #     "//html//div[@id='courseSummary']//tr/td[contains(text(), 'International Baccalaureate')]/following-sibling::td//text()").extract()
            # item['ib'] = ''.join(ib).strip()
            # print("item['ib'] = ", item['ib'])

            # //html//div[@id='fees']//tr[4]
            tuition_fee = response.xpath(
                "//html//div[@id='fees']//tr[4]//text()").extract()
            clear_space(tuition_fee)
            print("tuition_fee: ", tuition_fee)
            tuition_fee_str = ''.join(tuition_fee).strip().strip(
                "International")
            if "£" in tuition_fee_str:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = int(
                    tuition_fee_str.replace('£', '').replace(',', '').strip())
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            # //div[@class='panel bg--primary']//div[@class='inner']//p
            department = response.xpath(
                "//div[@class='panel bg--primary']//div[@class='inner']//p//text()"
            ).extract()
            clear_space(department)
            # print(department)
            for d in department:
                if "School" in d:
                    item['department'] = d.strip()
                elif "College" in d:
                    item['department'] = d.strip()
                elif "Campus" in d or d == "Biological Sciences" or d == "Marketing strategy" or d == "Management":
                    item['department'] = d.strip()
                elif len(d) == 4 or len(
                        d
                ) == 5 or d == "Arts, English and Languages" or d == "Global Food Security" or d == "Centre for Economic History":
                    item['department'] = d.strip()
            # print("item['department']: ", item['department'])

            department = response.xpath(
                "//html//div[@class='panel bg--grey-l']/div[@class='inner']//a//text()"
            ).extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            # //html//div[@class='panel bg--grey-l']/div[@class='inner']/p[1]
            location = response.xpath(
                "//html//div[@class='panel bg--grey-l']/div[@class='inner']/text()"
            ).extract()
            clear_space(location)
            item['location'] = '\t'.join(location).strip()
            print("item['location']: ", item['location'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h1 class="alt"><a name="UG"></a>Undergraduate entry requirements</h1>
<p>The following qualifications will be considered for direct entry to undergraduate programmes:</p>
<ul>
<li>Students who have completed 12 years of education in China and attained the Secondary School Leaving Certificate with good grades must complete an approved Foundation programme or GCE A Levels for progression to undergraduate degree programmes.</li>
<li>The 'Gaokao' Chinese University Entrance Examination will be considered, along with performance in the Senior High School examination for entry to Stage 1 of our undergraduate programmes.</li>
<li>Progression to Stage 1 of an undergraduate degree programme at Queen's&nbsp;(with the exception of Agricultural Technology, Medicine, Dentistry and Social Work) is guaranteed for students who successfully complete the <a title="University%20Preparation%20Courses" href="/home/International/International-students/Applying/University-Preparation-Courses/">INTO Queen's International Foundation Programme</a> at the required standard.</li>
<li>Students who have completed one or two years of university study in China may be eligible for admission to Bachelor degree programmes, if relevant subjects have been studied and strong grades have been achieved.</li>
<li>Applicants who have already completed A-Levels/a recognised Foundation programme or the first year of a relevant degree programme in China, but who do not meet the academic or English language requirements for entry, may wish to consider <a title="University%20Preparation%20Courses" href="/home/International/International-students/Applying/University-Preparation-Courses/">INTO Queen's International Year One</a>. Successful completion at the required standard offers direct entry to the second year of selected undergraduate degree programmes in Management, Economics, Finance and Engineering.</li>
<li>Between 30 and 36 points in the International Baccalaureate Diploma (IB). <a href="/home/International/International-students/Your-Country/InternationalBaccalaureateIBDiplomaEntryRequirements/">Information on required grades</a>.</li>
</ul>
<p><strong>Please note: </strong>Grades required vary depending on the programme of study. Further guidance on the entry requirements for each degree programme can be found in the Undergraduate Coursefinder.</p>"""
                ]))

            apply_proces_en = response.xpath("//div[@id='apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))
            print("item['apply_proces_en']: ", item['apply_proces_en'])
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'w',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 25

0

Exibir arquivo

Arquivo: UniversityOfChester_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Chester"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//h1[@id='main-content']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//h1[@id='main-content']/div//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            start_date = response.xpath(
                "//select[@id='edit-date']/option//text()|//label[@for='edit-date']/following-sibling::span//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            start_date_str = ""
            if len(start_date) > 0:
                for s in start_date:
                    # start_date_str = getStartDate(s)
                    if getStartDate(s) != None:
                        start_date_str += getStartDate(s) + ", "
            item['start_date'] = start_date_str.strip().strip(',').strip()
            # print("item['start_date']: ", item['start_date'])

            mode = response.xpath(
                "//select[@id='edit-mode']//text()").extract()
            clear_space(mode)
            # item['teach_time'] = getTeachTime(''.join(mode))
            # print("mode: ", mode)

            location = response.xpath(
                "//label[@for='edit-compulsory']/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            ucascode = response.xpath(
                "//dt[contains(text(),'UCAS Code')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            print("item['ucascode'] = ", item['ucascode'])

            duration = response.xpath(
                "//dt[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//h3[contains(text(),'Course overview')]/../*[position()<last()]|"
                "//div[@class='m-body__margin-bottom t-course__overview']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            entry_requirements = response.xpath(
                "//div[@id='entry-international']//form[@id='courses-international-form']/preceding-sibling::*//text()"
            ).extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath(
                "//td[contains(text(),'GCE A Level')]/following-sibling::*//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            # print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//td[contains(text(),'International Baccalaureate')]/following-sibling::*//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib)
            # print("item['ib']: ", item['ib'])

            ielts_desc = response.xpath(
                "//div[@id='entry-international']//li[contains(text(),'Undergraduate:')]//text()"
            ).extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            assessment_en = response.xpath(
                "//h3[@class='field-label'][contains(text(),'How will I be taught?')]/..|"
                "//h3[@class='field-label'][contains(text(),'How will I be assessed?')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath(
                "//div[@class='field-fees-international']/p//text()|"
                "//p[contains(text(),'The tuition fees for international students studyi')]//text()"
            ).extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            career_en = response.xpath(
                "//div[@id='careers-career-services']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            print("item['career_en']: ", item['career_en'])

            modules = re.findall(
                r"function\sinit_drupal_core_settings\(\)\s{jQuery\.extend\(Drupal\.settings,.*}",
                response.text)
            # print("modules: ", modules)
            modules_str = ''.join(modules).replace(
                "function init_drupal_core_settings() {jQuery.extend(Drupal.settings,",
                "").strip()
            modules_dict = json.loads(modules_str)
            # print("modules_dict: ", modules_dict)
            # groupCode     modulesNid
            # print(modules_dict.get("courses"))
            if modules_dict.get('courses').get('groupCode') is not False:
                modules_json = "https://www1.chester.ac.uk/courses/modules/ajax/" + modules_dict.get(
                    'courses').get('modulesNid') + "/" + modules_dict.get(
                        'courses').get('groupCode') + "/389"
                # print("modules_json: ", modules_json)
                mdict = json.loads(requests.get(modules_json).text)
                # print("mdict: ", len(mdict))
                m = mdict[-1].get('data')
                if m != None:
                    item['modules_en'] = remove_class(clear_lianxu_space([m]))
            # print("item['modules_en']: ", item['modules_en'])

            item[
                'apply_proces_en'] = "https://www1.chester.ac.uk/undergraduate/how-apply/applying-full-time-courses"
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="field-collection-view clearfix view-mode-full">
  <h3 class="field-course-type">
    Undergraduate Study  </h3>

  <ul><li>UK foundation/pathway course with a pass mark of 50% and above.  Engineering courses require an additional mark of at least 55% in a Maths module. </li>
<li>China 3 year National Senior High School Certificate with 80% or above</li>
<li>Gaokao (College Entry Exam) with good grades </li>
<li>Dazhuan considered for entry to 3rd year UG</li>
<li>BFSU Foundation Year at 60% or above</li>
<li>Dongfang International Centre for Education Exchange Top University Foundation Course 60% or above</li>
<li>East and West International Education (EWIE)/ Wiseway Global International Foundation Certificate at 60% or above</li>
<li>Graduation Certificate from a specialised College/School (Zhongzuhan) with 80% or above</li>
</ul></div>"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 26

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Reading"
        # item['country'] = 'England'
        # item['website'] = 'http://www.reading.ac.uk/'
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Whiteknights,PO Box 217,Reading, Berkshire,RG6 6AH"
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型、ucas_code
            programmeDegree_typeUcascode = response.xpath(
                "//span[@class='text-bg-standout text-nice-wrap']/text() | //h1[@id='heading']//text() | //h1[@class='hero-heading']//text() | //h1[@class='block-heading block-heading-l5 block-heading-b5 block-heading-md-l-reset cell-md-t0']//text()"
            ).extract()
            clear_space(programmeDegree_typeUcascode)
            programmeDegree_typeUcascode = ''.join(
                programmeDegree_typeUcascode).strip()
            # print("programmeDegree_typeUcascode: ", programmeDegree_typeUcascode)

            degree_type = re.findall(r"^\w+/\w+", programmeDegree_typeUcascode)
            if len(degree_type) == 0:
                degree_type = re.findall(r"^\w+", programmeDegree_typeUcascode)
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            ucascode = re.findall(r"\w{4}$", programmeDegree_typeUcascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])

            programme = programmeDegree_typeUcascode.replace(
                item['degree_name'], '').replace(item['ucascode'], "").strip()
            item['programme_en'] = programme.title()
            print("item['programme_en']: ", item['programme_en'])

            # duration
            durationMode = response.xpath(
                "//h2[@class='row-margin-small text-weight-medium text-size-25']/text() | //strong[contains(text(),'Duration')]/../text() | //h3[contains(text(),'Programme length:')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(durationMode)
            # print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            duration_list = getIntDuration(''.join(durationMode))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # start_date = response.xpath("//p[@class='headline'][contains(text(), 'Start date')]//text()").extract()
            # # print(start_date)
            # item['start_date'] = getStartDate(''.join(start_date))
            # # print("item['start_date']: ", item['start_date'])

            overview2 = response.xpath(
                "//div[@class='m-bg-white m-pad-around m-pull-left-normal m-pull-up']//div[@class='theme-editor'] | //div[@id='top-courseOverview'] | //html//div[@id='top-programmeOverview']/h2[1]/following-sibling::div[1] | //div[@id='tc1']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview2))
            # if item['overview_en'] == "":
            #     print("***overview_en")
            # print("item['overview_en']: ", item['overview_en'])

            # department
            item['department'] = response.meta['department']
            # print("item['department']: ", item['department'])

            if item['department'] == "":
                department = response.xpath(
                    "//aside[contains(@class,'pane base4 m-margin-bottom')]//div[contains(@class,'row-small')]//p[contains(text(), 'School')]/following-sibling::*//text()"
                ).extract()
                clear_space(department)
                item['department'] = ''.join(department).strip()
                item['department'] = item['department'].replace(
                    "How to apply",
                    "").replace("Visit the",
                                "").replace("website",
                                            "").strip().strip('.').strip()
                # print("item['department']1: ", item['department'])
            # if item['department'] == "":
            #     print("***department")

            # //h2[@id='Panel1Trigger']/../..
            entry_requirements = response.xpath(
                "//span[contains(text(),'entry requirements')]/../../.."
            ).extract()
            entry = ''.join(entry_requirements).strip()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            # if item['apply_desc_en'] == "":
            #     print("apply_desc_en 为空")
            # print("item['apply_desc_en']: ", item['apply_desc_en'])

            alevel = response.xpath(
                "//h4[contains(text(),'Typical')]/following-sibling::*[1]//text()|//h4[contains(text(),'A level')]/following-sibling::*[1]//text()"
            ).extract()
            item['alevel'] = ''.join(alevel).strip()
            # if item['alevel'] == "":
            #     print("alevel 为空")
            # print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//h4[contains(text(),'International Baccalaureate')]/following-sibling::*[1]//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()
            # if item['ib'] == "":
            #     print("ib 为空")
            # print("item['ib']: ", item['ib'])

            ielts = re.findall(r"IELT.{1,100}", entry)
            ielts = response.xpath(
                "//*[contains(text(),'IELT')]//text()").extract()
            if ''.join(ielts).strip() == "IELTS":
                ielts = response.xpath(
                    "//*[contains(text(),'IELT')]/following-sibling::*[1]//text()"
                ).extract()
            clear_space(ielts)
            item['ielts_desc'] = ''.join(ielts).strip()
            # if item['ielts_desc'] == "":
            #     print("ielts_desc 为空")
            # print("item['ielts_desc']: ", item['ielts_desc'])
            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # toefl = re.findall(r"TOEFL[\s\(\)\w:\.]{1,300}", entry)
            # if item['toefl_desc'] == "":
            #     item['toefl_desc'] = ''.join(toefl)
            # print("item['toefl_desc']: ", item['toefl_desc'])
            # toeflDict = get_toefl(item['toefl_desc'])
            # item['toefl'] = toeflDict.get("TOEFL")
            # item['toefl_l'] = toeflDict.get("TOEFL_L")
            # item['toefl_s'] = toeflDict.get("TOEFL_S")
            # item['toefl_r'] = toeflDict.get("TOEFL_R")
            # item['toefl_w'] = toeflDict.get("TOEFL_W")
            # # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            modules = response.xpath(
                "//h2[@id='Panel2Trigger']/../..|//div[@id='bottom-courseContent']/..|//div[@id='page_content_wrap']/following-sibling::div[position()<3]|//strong[contains(text(),'Programme structure')]/../following-sibling::*"
            ).extract()
            if len(modules) == 0:
                modules = response.xpath(
                    "//h4[contains(text(),'Programme structure and content')]/preceding-sibling::*[1]/following-sibling::*[position()<11]"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            # //h2[@id='Panel1Trigger']/../..
            career = response.xpath(
                "//h2[@id='Panel4Trigger']/../following-sibling::div[1]|//div[@id='bottom-careers']/..|//div[@id='careers']|//h3[contains(text(),'Careers')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //h3[@class='row-margin-small text-weight-medium'][contains(text(),'How much will it cost?')]/following-sibling::p[2]
            tuition_fee = response.xpath(
                "//p[contains(text(),'New international students')]//text()"
            ).extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+|£\d+", ''.join(tuition_fee))
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = int(''.join(tuition_fee_re).replace(
                    "£", "").replace(",", "").strip())

            if item['tuition_fee'] == 0:
                item[tuition_fee] = None
            else:
                item['tuition_fee_pre'] = "£"
            # if item['tuition_fee'] is None:
            #     print("tuition_fee 为空")
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='top-howWeTeachYou']
            assessment_en = response.xpath(
                "//div[@id='top-howWeTeachYou']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="row row-margin-small row-margin-title-10">

                        <h1 class="text-transform-uppercase text-size-30 m-text-size-25 text-weight-medium display-inline text-bg-standout text-nice-wrap">
                            <span class="text-bg-standout">How to apply for undergraduate courses</span>
                        </h1>

                    </div>
                                    <div class="theme-editor theme-editor-break-word">
                        You can apply online for all of our courses via the national admissions service, <a href="http://www.ucas.com">UCAS</a>. You can choose to apply for up to five courses in total, including more than one course at the same institution. <br />
<h4>When to apply&nbsp;</h4>
<p>UK or EU students: You should aim to apply via UCAS between 1 September and 15 January for admission in September 2018. If you have missed the 15 January deadline, there is still the opportunity to apply (via UCAS), and we are happy to consider late applications until 30 June 2018 (all applications received after 30 June are entered into Clearing). Please be aware that some of our courses may be full after the UCAS deadline, so we do recommend early applications where possible.</p>
<p>
International students: You should aim to apply via UCAS between 1 September 2017 and 15 January 2018 for admission in September 2018, though applying before 15 January is encouraged in order to ensure you have time to prepare for studying in the UK. However, if you have missed the 15 January deadline, you are still welcome to apply (via UCAS), and we are happy to consider late applications until 30 June 2018 (all applications received after 30 June are entered into Clearing). Please be aware that some of our courses may be full after the UCAS deadline, so we do recommend early applications where possible.</p>
<h4>UCAS code</h4>
<p>Our UCAS code is R12. The University does not have a campus code.&nbsp;</p>
<h4>UCAS costs</h4>
<p>There is a small charge made by UCAS for applying to university. The application fee is &pound;13 if you&rsquo;re applying to just one course, or &pound;24 for multiple courses and for late applications sent after 30 June.</p>
<h4>Entry requirements</h4>
<p>Please read our <a href="/ready-to-study/study/how-to-apply/entry-requirements-ug.aspx">entry requirements page</a> for more information on accepted qualifications.</p>
<h4>English language requirements</h4>
<p>If English is not your first language, you can find out more information on our <a href="/ready-to-study/international-and-eu/english-language-requirements.aspx">English language requirements</a> page.</p>
                    </div>
                                    <div class="row-large paddingtop-small pad-sides border-top-light">
                        <div class="visuallyhidden" id="show-more-094422b2-b9da-4602-9594-80e05dba925c" aria-hidden="true">
                            <div class="theme-editor">
                                <h4>The application process&nbsp;
</h4>
<p>Once UCAS receives your application, it sends it to our Admissions Office, who assess it and decide whether to offer you a place. The way we assess your application will differ from course to course, but we will use the information supplied in your application form including your personal statement, predicted and achieved grades and the reference supplied by your school or college.&nbsp;</p>
<p>We carefully consider every application so please don’t worry if you don’t hear back from us straight away. We aim to make a decision on all applications within four weeks, and you will be able to track the progress of your application on <a href="https://www.ucas.com/ucas/undergraduate/login">UCAS Track</a>.&nbsp;</p>
<p>We will email you with the outcome of your application and confirm this with UCAS so that you can see the decision online using UCAS Track. If we offer you a place, we will explain any conditions attached to that offer (for example, the need to achieve certain grades in your examinations).&nbsp;</p>
<h4>Interviews</h4>
<p> For some courses, we invite prospective students for an interview before making an offer. These are:&nbsp;</p>
<ul><li>Accounting and Business (assessment centre run in conjunction with PwC)&nbsp;</li>
    <li>Archaeology&nbsp;</li>
    <li>Art</li>
    <li>Chemistry&nbsp;</li>
    <li>Film, Theatre &amp; Television&nbsp;</li>
    <li>Food and Nutritional Sciences&nbsp;</li>
    <li>Graphic Communication&nbsp;</li>
    <li>Pharmacy&nbsp;</li>
    <li>Primary Education&nbsp;</li>
    <li>Psychology (MSci courses)&nbsp;</li>
    <li>Meteorology and Climate (MMet course)&nbsp;</li>
    <li>Theatre Arts, Education and Deaf Studies (TAEDS)&nbsp;</li>
</ul>
<h4>Visit Days</h4>
<p> If you are offered a place to study at the University of Reading without an interview, we will invite you to attend a Visit Day in your department of choice. Visit Days take place between November and March and will usually include a tour of our campus and facilities, a visit to a hall of residence, and the chance to meet academic staff and current students.&nbsp;</p>
<h4>Choosing offers&nbsp;</h4>
<p>Once you have heard from all of the universities that you applied to, UCAS will ask you which offer you want to accept. Most people choose two: one as your ‘firm’ or first choice, the other as your ‘insurance’ or second choice. If you meet the conditions of your offer, you will automatically be accepted onto your firm choice course.&nbsp;</p>
<h4>Confirmation of your place&nbsp;</h4>
<p>Most offers are conditional on exam results. If you meet the conditions set out in our offer, your place is assured and you will see this on <a href="https://www.ucas.com/ucas/undergraduate/login">UCAS Track</a> . If you do not meet the conditions set out in your offer, you may still be able to get on the course. We will let you know as soon as possible after we have received your results.&nbsp;</p>
<h4>Gap year/deferred entry&nbsp;</h4>
<p>We welcome deferred entry applications. You need to apply at the same time as if you were planning to go straight to university, but you should state in your UCAS application that you wish to be considered for deferred admission.</p>
                            </div></div></div>"""
                ]))
            print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['interview_desc_en'] = remove_class(
                clear_lianxu_space([
                    """<h4>Interviews</h4>
<p> For some courses, we invite prospective students for an interview before making an offer. These are:&nbsp;</p>
<ul><li>Accounting and Business (assessment centre run in conjunction with PwC)&nbsp;</li>
    <li>Archaeology&nbsp;</li>
    <li>Art</li>
    <li>Chemistry&nbsp;</li>
    <li>Film, Theatre &amp; Television&nbsp;</li>
    <li>Food and Nutritional Sciences&nbsp;</li>
    <li>Graphic Communication&nbsp;</li>
    <li>Pharmacy&nbsp;</li>
    <li>Primary Education&nbsp;</li>
    <li>Psychology (MSci courses)&nbsp;</li>
    <li>Meteorology and Climate (MMet course)&nbsp;</li>
    <li>Theatre Arts, Education and Deaf Studies (TAEDS)&nbsp;</li>
</ul>"""
                ]))
            print("item['interview_desc_en']: ", item['interview_desc_en'])

            item["require_chinese_en"] = remove_class(
                clear_lianxu_space([
                    """<h2 class="trigger">Entry requirements</h2>
<table summary="A table outlining the basic entry requirements for courses at the University of Reading based on the qualifications offered in your country">
<tbody>
<tr><!-- HEADINGS-->
<td class="top-head"><strong>Your highest qualification</strong></td>
<td class="top-head"><strong>Likely entry level</strong></td></tr>
<tr><!-- FIRST ROW -->
<td>
<p><!-- EG FIRST ROW FIRST COLUMN INFO -->High School year 2 (Year 11) with leaving certificate: GPA 85%<br/>High School year 3 (Year 12) with graduation certificate: GPA 80%</p></td>
<td><a href="http://www.reading.ac.uk/foundation" name="ifp" >International Foundation Programme</a> </td></tr>
<tr class="even"><!-- SECOND ROW -->
<td>Gao Kao (Chinese University Entrance Exam) 80%</td>
<td><a href="http://www.reading.ac.uk/foundation" name="ifp" >International Foundation Programme</a> </td></tr>
<tr><!-- THIRD ROW -->
<td>Gau Cau (Chinese University Entrance Exam) combined with a successfully completed appropriate foundation/bridging programme. (Visit our <a href="http://www.reading.ac.uk/foundation" name="ifp" >International Foundation Programme</a>) </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr class="even"><!-- FOURTH ROW -->
<td>International Baccalaureate (IB) Diploma </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr><!-- FIFTH ROW -->
<td>British/International A Levels </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr class="even"><!-- SIXTH ROW -->
<td>Chinese-medium A Levels in Mathematics and Sciences (Cambridge Examinations Board) </td>
<td>Undergraduate Degree (Bachelors Degree) in a relevant subject </td></tr>
<tr>
<td>Ameson: High school results of 85% if 11 years completed, 80% if 12 years (with similar grades in relevant subjects), AST Maths: 165 and AST English: 150</td>
<td>Undergraduate Degree (Bachelors Degree)</td></tr>
<tr class="even"><!-- EIGHTH ROW -->
<td>Other international qualifications such as Australian HSC, US SAT or AP Certificates</td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr><!-- NINTH ROW -->
<td>Successfully completed first year of a Chinese University degree </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr class="even"><!-- TENTH ROW -->
<td>4-year Bachelor degree </td>
<td>Taught Postgraduate (Masters and Doctoral Degree) </td></tr>
<tr>
<td>&nbsp;Masters degree study </td>
<td>&nbsp;Research Postgraduate (Doctoral Degree) </td></tr></tbody></table>"""
                ]))
            print("item['require_chinese_en']: ", item['require_chinese_en'])
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 27

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Sheffield"
        # item['country'] = 'England'
        # item['website'] = 'https://www.sheffield.ac.uk'
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Western Bank, Sheffield, S10 2TN, UK"
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型
            programmeDegree_type = response.xpath(
                "//div[@class='titles']/h2//text()").extract()
            if len(programmeDegree_type) == 0:
                programmeDegree_type = response.xpath(
                    "//main[@class='main content']/h2[1]//text()").extract()
            programmeDegree_type = ''.join(programmeDegree_type).strip()
            print("programmeDegree_type: ", programmeDegree_type)
            degree_typeList = re.findall(r"[A-Za-z/\(\)]*$",
                                         programmeDegree_type)
            # print("degree_typeList: ", degree_typeList)
            programme = programmeDegree_type
            if len(degree_typeList) != 0:
                degree_type = ''.join(list(degree_typeList[0]))
                item['degree_name'] = degree_type
                programme = programmeDegree_type.replace(
                    item['degree_name'], '')
            print("item['degree_name']: ", item['degree_name'])
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            # 学院
            department = response.xpath(
                "//div[@class='titles']//h3//text()").extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            ucascode = response.xpath("//span[@id='adCode']//text()").extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).strip()
            print("item['ucascode'] = ", item['ucascode'])

            # 课程长度
            durationContent = response.xpath(
                "//h3[contains(text(),'Course details')]/following-sibling::text()"
            ).extract()
            clear_space(durationContent)
            # print(durationContent)

            duration_list = getIntDuration(''.join(durationContent))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            # 专业描述
            overview = response.xpath("//div[@class='descHold']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            alevel = response.xpath(
                "//html//div[@id='courseSummary']//tr/td[contains(text(), 'A Levels')]/following-sibling::td//text()"
            ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//html//div[@id='courseSummary']//tr/td[contains(text(), 'International Baccalaureate')]/following-sibling::td//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()
            # print("item['ib'] = ", item['ib'])

            ielts_desc = response.xpath(
                "//*[contains(text(),'IELTS')]//text()").extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ieltDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltDict.get('IELTS')
            item["ielts_l"] = ieltDict.get('IELTS_L')  # float
            item["ielts_s"] = ieltDict.get('IELTS_S')  # float
            item["ielts_r"] = ieltDict.get('IELTS_R')  # float
            item["ielts_w"] = ieltDict.get('IELTS_W')
            # print("ielts = %s  ielts_l = %s  ielts_s = %s  ielts_r = %s  ielts_w = %s"%(
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            modules_en = response.xpath("//div[@id='modules']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath("//div[@id='ltam']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            career_en = response.xpath("//div[@id='graduates']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en']: ", item['career_en'])

            # https://www.sheffield.ac.uk/prospectus/courseDetails.do?id=N1202019
            # # start_date //a[@href='#tab00']
            # start_date = response.xpath(
            #     "//table[@class='cms-tabs']/tbody/tr[last()]/th[1]//text()").extract()
            # clear_space(start_date)
            # start_date_str = ''.join(start_date).replace('start', '').replace('entry', '').strip()
            # # print(start_date_str)
            # start_date_1 = getStartDate(start_date_str)
            # print(start_date_1)
            # item['start_date'] = start_date_1
            # print("item['start_date']: ", item['start_date'])

            # //div[@id='tab00']
            # modules   评估方式

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<h1>How to apply: applying essentials</h1>
    <p><img class="imgRight" alt="Undergraduates in a tutorial"   src="/polopoly_fs/1.550384!/image/undergraduates320.jpg" />This page provides key information about applying to study on an undergraduate course at Sheffield, and contains links to all of our procedures and Admissions policies. Please take the time to read this information before completing your application.</p>
    <h3>Before you apply</h3>
    <p>We normally expect applicants to offer three full A Levels or an accepted equivalent qualification. You can check the University's general entry requirements, including which UK and International qualifications we accept and our English language and mathematics requirements, on our Admissions requirements webpage:</p>
    <p><a  href="/undergraduate/apply/requirements">Admissions requirements</a></p>
    <p>You can find details of the entry requirements for particular courses in our online prospectus. Please note that these represent our typical offer conditions only – we may make different offers in some cases.</p>
    <p><a href="http://www.sheffield.ac.uk/prospectus">Online prospectus</a></p>
    <p>A full list of our formal policies relating to Admissions is available on our Policies webpages. This includes our Student Admissions Policy as well as policies on A Level subject combinations, resits, and qualifications taken early.</p>
    <p><a  href="/undergraduate/policies">Our policies</a></p>
    <h3>Applying</h3>
    <p>You can apply for an undergraduate course at Sheffield via UCAS (the Universities and Colleges Admissions Service):</p>
    <p><a href="http://www.ucas.com/apply">UCAS website – Apply</a></p>
    <p>Applications for places on courses starting the following September (except Medicine and Dentistry) should be submitted to UCAS between:</p>
    <ul>
        <li>1 September and 15 January to be guaranteed equal consideration with other applicants</li>
        <li>16 January and 30 June for further consideration, although we may not be able to consider your application if all the places on the course you have applied for have been filled</li>
    </ul>
    <p>Applications for places on Medicine and Dentistry courses must be submitted between 1 September and 15 October.</p>
    <p>You can find more information about how and when to apply on our Applying webpage. This also contains information about deferred entry, direct entry to year/level 2 and our foundation year courses.</p>
    <p><a  href="/undergraduate/apply/applying">Applying</a></p>
    <p>Our Education For All webpage provides information on the support we provide for Care Leavers, estranged students, carers, mature students and students with a disability or learning difficulty. You can also find information about our outreach activities, our use of contextual data and our Disrupted Studies scheme.</p>
    <p><a  href="/undergraduate/apply/wp">Education for all: Widening Participation and Disrupted Studies</a></p>
    <h3>After you apply</h3>
    <p>You can find out what happens after you have submitted your application on our <a  href="/undergraduate/apply/after">After You Apply</a> webpages. If we make you a Conditional offer and you accept us as either your Firm or Insurance choice, we will also send you an email containing information about what happens when you get your exam results.</p>
    <p><a  href="/undergraduate/apply/after">After you apply</a></p>
    <p>If at any time you find that your studies are&#160;affected by personal, social or domestic issues, please let us know by using our Disrupted Studies form:</p>
    <p><a  href="/undergraduate/apply/applying/disrupted">Disrupted Studies</a></p>
    <h3>Further information</h3>
    <p>If you have any further questions about the University and applying to study with us, please <a href="http://ask.sheffield.ac.uk">Ask Sheffield</a>.</p>
    <p>If you still need help, our Applicant Information Desk (AiD) provides a first point of contact for people who have applied to the University. AiD can help with any questions you have about the process of applying to us and the current status of your application.</p>
    <p><a  href="/aid">Applicant Information Desk</a></p>
    <p>We wish you the best of luck with your application.</p>"""
                ]))
            item['require_chinese_en'] = ''

            tuition_feeDict = {
                "C180": "21450",
                "C200": "21450",
                "C300": "21450",
                "C100": "21450",
                "C109": "21450",
                "C189": "21450",
                "C209": "21450",
                "C309": "21450",
                "C1C9": "21450",
                "C1CX": "21450",
                "C1R9": "21450",
                "C101": "21450",
                "F400": "18900",
                "FV41": "18900",
                "VV46": "18900",
                "VR47": "18900",
                "VR41": "18900",
                "VR42": "18900",
                "F410": "18900",
                "VR44": "18900",
                "QV84": "18900",
                "F401": "18900",
                "KK13": "21450",
                "K100": "21450",
                "ARCU123": "21450",
                "ARCU124": "21450",
                "ARCU13": "21450",
                "ARCU129": "21450",
                "Y001": "16800",
                "H130": "21450",
                "G500": "21450",
                "H690": "21450",
                "H660": "21450",
                "H310": "21450",
                "H360": "21450",
                "H361": "21450",
                "H1NF": "21450",
                "H1NF": "21450",
                "HN62": "21450",
                "OG31": "21450",
                "8L16": "21450",
                "57": "21450",
                "2G36": "21450",
                "8M74": "21450",
                "2A47": "21450",
                "H653": "21450",
                "H659": "21450",
                "B900": "21450",
                "B909": "21450",
                "H810": "21450",
                "H800": "21450",
                "H840": "21450",
                "H8T9": "21450",
                "H8F1": "21450",
                "H8J7": "21450",
                "H801": "21450",
                "F100": "21450",
                "F105": "21450",
                "F107": "21450",
                "F106": "21450",
                "F335": "21450",
                "F109": "21450",
                "F108": "21450",
                "C720": "21450",
                "H210": "21450",
                "HK21": "21450",
                "H2T9": "21450",
                "H200": "21450",
                "H202": "21450",
                "HK2D": "21450",
                "H2N2": "21450",
                "2H26": "21450",
                "8T63": "21450",
                "8L55": "21450",
                "2G91": "21450",
                "H201": "21450",
                "A200": "21450",
                "G600": "21450",
                "G650": "21450",
                "G402": "21450",
                "G400": "21450",
                "GG41": "21450",
                "GG74": "21450",
                "G4G1": "21450",
                "G700": "21450",
                "G490": "21450",
                "G495": "21450",
                "G401": "21450",
                "G651": "21450",
                "GN52": "21450",
                "GN53": "21450",
                "X301": "16800",
                "F401": "18900",
                "Q305": "16800",
                "Q310": "16800",
                "F901": "18900",
                "L701": "18900",
                "V101": "16800",
                "Q307": "16800",
                "V501": "16800",
                "L301": "16800",
                "L401": "16800",
                "K401": "16800",
                "K441": "16800",
                "L790": "16800",
                "QC19": "21450",
                "B990": "21450",
                "C801": "21450",
                "V642": "16800",
                "L432": "16800",
                "T210": "16800",
                "T300": "18900",
                "TN42": "18900",
                "T110": "16800",
                "T415": "16800",
                "TN12": "18900",
                "T1T2": "16800",
                "T4T2": "16800",
                "T1R2": "16800",
                "T2R2": "16800",
                "T1R4": "16800",
                "T2R4": "16800",
                "T1R7": "16800",
                "T2R7": "16800",
                "T1R1": "16800",
                "TV11": "16800",
                "TV21": "16800",
                "L100": "16800",
                "LV15": "16800",
                "LL12": "16800",
                "L101": "16800",
                "LG11": "16800",
                "L1N3": "16800",
                "LIN3": "16800",
                "X300": "16800",
                "X301": "16800",
                "H620": "21450",
                "H621": "21450",
                "H610": "21450",
                "H613": "21450",
                "H614": "21450",
                "H651": "21450",
                "H647": "21450",
                "H645": "21450",
                "H6T9": "21450",
                "H623": "21450",
                "H615": "21450",
                "H616": "21450",
                "H652": "21450",
                "H649": "21450",
                "H622": "21450",
                "H611": "21450",
                "H648": "21450",
                "H629": "21450",
                "H628": "21450",
                "H602": "21450",
                "H603": "21450",
                "H100": "21450",
                "H104": "21450",
                "H675": "21450",
                "H673": "21450",
                "H67I": "21450",
                "H67H": "21450",
                "Q3Q1": "16800",
                "QL33": "16800",
                "QR14": "16800",
                "QR17": "16800",
                "QR32": "16800",
                "QR37": "16800",
                "QV15": "16800",
                "QT12": "16800",
                "Q304": "16800",
                "Q310": "16800",
                "Q305": "16800",
                "Q306": "16800",
                "QR31": "16800",
                "QV31": "16800",
                "QW33": "16800",
                "QV35": "16800",
                "QR34": "16800",
                "QW34": "16800",
                "Q307": "16800",
                "F309": "21450",
                "G109": "21450",
                "QR11": "16800",
                "R120": "16800",
                "RL11": "16800",
                "RL12": "16800",
                "RN12": "16800",
                "RR12": "16800",
                "RR14": "16800",
                "RR17": "16800",
                "RV11": "16800",
                "RV15": "16800",
                "RW13": "16800",
                "R1R9": "16800",
                "R1T2": "16800",
                "R1R7": "16800",
                "R1RR": "16800",
                "R1RO": "16800",
                "L700": "18900",
                "F800": "18900",
                "F902": "18900",
                "F900": "18900",
                "F901": "18900",
                "QR12": "16800",
                "R220": "16800",
                "RL21": "16800",
                "RL22": "16800",
                "RN22": "16800",
                "RR24": "16800",
                "RR27": "16800",
                "RV21": "16800",
                "RV25": "16800",
                "RW23": "18900",
                "R2R9": "16800",
                "R2T2": "16800",
                "R2R7": "16800",
                "R2RR": "16800",
                "R2R3": "16800",
                "R410": "16800",
                "RL42": "16800",
                "RN42": "16800",
                "RL41": "16800",
                "R4T2": "16800",
                "R4R7": "16800",
                "R4RR": "16800",
                "V100": "16800",
                "RV71": "16800",
                "RV41": "16800",
                "VV15": "16800",
                "VL12": "16800",
                "VL13": "16800",
                "V1R9": "16800",
                "V101": "16800",
                "B620": "21450",
                "QC18": "21450",
                "QC19": "21450",
                "P110": "18900",
                "P500": "18900",
                "K3K4": "18900",
                "KC39": "18900",
                "M100": "16800",
                "ML94": "16800",
                "M1R4": "16800",
                "M1R2": "16800",
                "M1R1": "16800",
                "M930": "16800",
                "M120": "16800",
                "N200": "16800",
                "N420": "16800",
                "NG21": "16800",
                "NG41": "16800",
                "NL21": "16800",
                "NL41": "16800",
                "NP21": "16800",
                "NP41": "16800",
                "NT22": "16800",
                "N120": "16800",
                "JH51": "21450",
                "J500": "21450",
                "J5R9": "21450",
                "FH21": "21450",
                "J200": "21450",
                "FHF1": "21450",
                "H403": "21450",
                "H401": "21450",
                "JH5P": "21450",
                "JH56": "21450",
                "J501": "21450",
                "G100": "18900",
                "G103": "18900",
                "GN13": "18900",
                "G102": "18900",
                "G1R4": "18900",
                "G1R1": "18900",
                "G1R2": "18900",
                "G106": "18900",
                "VG51": "18900",
                "A100": "21450",
                "T900": "16800",
                "C400": "21450",
                "C500": "21450",
                "C440": "21450",
                "C700": "21450",
                "C741": "21450",
                "CC45": "21450",
                "CC74": "21450",
                "CC75": "21450",
                "C709": "21450",
                "CC7C": "21450",
                "CC79": "21450",
                "C409": "21450",
                "CC4C": "21450",
                "C749": "21450",
                "C509": "21450",
                "C449": "21450",
                "C790": "21450",
                "C791": "21450",
                "CC47": "21450",
                "CC4R": "21450",
                "C431": "21450",
                "C433": "21450",
                "C521": "21450",
                "C523": "21450",
                "W302": "18900",
                "RW43": "18900",
                "VW53": "18900",
                "WT34": "18900",
                "WT31": "18900",
                "WTH4": "18900",
                "B991": "21450",
                "B740": "21450",
                "B990": "21450",
                "B520": "21450",
                "QV36": "16800",
                "RV26": "16800",
                "QV16": "16800",
                "VW63": "16800",
                "VV56": "16800",
                "VR61": "16800",
                "BIBU08": "16800",
                "V641": "16800",
                "V500": "16800",
                "RV45": "16800",
                "V501": "16800",
                "F300": "21450",
                "F301": "21450",
                "F344": "21450",
                "F350": "21450",
                "FF35": "21450",
                "F371": "21450",
                "F3F5": "21450",
                "FV35": "21450",
                "F321": "21450",
                "F3G4": "21450",
                "F3GK": "21450",
                "F305": "21450",
                "F304": "21450",
                "F3F5": "21450",
                "L210": "16800",
                "LL23": "16800",
                "LV25": "16800",
                "L201": "16800",
                "LL24": "16800",
                "C800": "21450",
                "C802": "21450",
                "C801": "21450",
                "R710": "16800",
                "RL71": "16800",
                "RL72": "16800",
                "RN72": "16800",
                "RR47": "16800",
                "R7R7": "16800",
                "R7RR": "16800",
                "RV75": "16800",
                "RW73": "18900",
                "R7T2": "16800",
                "L300": "16800",
                "LL43": "16800",
                "NL2K": "16800",
                "NL24": "16800",
                "L391": "16800",
                "L301": "16800",
                "L401": "16800",
                "L722": "16800",
                "TRPU105": "16800",
                "LK74": "18900",
                "K401": "16800",
                "K441": "16800",
                "L790": "16800",
            }
            tuition_fee = tuition_feeDict.get(item['ucascode'])
            print("tuition_fee: ", tuition_fee)
            if tuition_fee != None:
                item['tuition_fee'] = int(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 28

0

Exibir arquivo

Arquivo: NottinghamTrentUniversity_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.ntu.ac.uk/"
        item['university'] = "Nottingham Trent University"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===============================")
        # print(response.url)
        print(item['url'])
        try:
            # 专业、学位类型
            programme = response.xpath(
                "//h1[@class='course-heading page-heading']//text()").extract(
                )
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            degree_type = response.xpath(
                "//h2[@class='js_qualification']/strong//text()").extract()
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name'] = ", item['degree_name'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-1 active-tab']/p[3]/span
            location = response.xpath(
                "//span[@class='location save']//text()").extract()
            item['location'] = ''.join(location).strip()
            # print("item['location'] = ", item['location'])

            start_date = response.xpath(
                "//strong[contains(text(),'Starting:')]/following-sibling::span//text()"
            ).extract()
            # print(start_date)
            item['start_date'] = ''.join(start_date)
            # print("item['start_date'] = ", item['start_date'])
            item['start_date'] = getStartDate(item['start_date'])
            # print("item['start_date']1 = ", item['start_date'])

            # //html//div[@class='content']/div[1]/div  专业描述
            overview = response.xpath(
                "//div[@id='what-you-will-study']/preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            # modules   课程设置
            modules = response.xpath(
                "//div[@id='what-you-will-study']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            # teaching_assessment   评估方式
            teaching_assessment = response.xpath(
                "//div[@id='how-youre-taught']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en'] = ", item['assessment_en'])

            # career   careers and employability section
            career = response.xpath(
                "//div[@id='careers-and-employability']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en'] = ", item['career_en'])

            # //div[@id='entry-requirements-1']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements-0']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            print("item['apply_desc_en'] = ", item['apply_desc_en'])

            # //div[@id='how-to-apply-1']
            how_to_apply = response.xpath(
                "//div[@id='how-to-apply-1']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # //div[@id='how-to-apply-1']//h3[contains(text(),'Interview')]/following-sibling::p[position()<3]
            interview_desc_en = response.xpath(
                "//div[@id='how-to-apply-1']//h3[contains(text(),'Interview')]/following-sibling::p[position()<3]"
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en'] = ", item['interview_desc_en'])

            # deadline
            deadline = response.xpath(
                "//div[@id='how-to-apply-1']//p//strong[contains(text(),'Application closing date')]/../following-sibling::p[1]//text()|//div[@id='how-to-apply-1']//h3[contains(text(),'Application deadline')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(deadline)
            # print("deadline: ", deadline)
            deadline_str = ''.join(deadline)
            item['deadline'] = getStartDate(deadline_str)
            # print("item['deadline'] = ", item['deadline'])

            alevel = response.xpath(
                "//div[@id='entry-requirements-0']//li[contains(text(),'A-levels')]//text()"
            ).extract()
            clear_space(alevel)
            # print(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])

            if len(item['alevel']) > 160:
                item['alevel'] = ''.join(item['alevel'][:161])
            # print("item['alevel']1 = ", item['alevel'])

            # https://www.ntu.ac.uk/international/scholarships-and-fees/tuition-fees
            tuition_fee = response.xpath(
                "//html//div[@id='fees-and-funding-1']//text()").extract()
            clear_space(tuition_fee)
            # print(tuition_fee)
            tuition_fee = getTuition_fee(''.join(tuition_fee))
            item['tuition_fee'] = tuition_fee
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            departmentDict = {
                "Economics with Business":
                "Nottingham Business School",
                "Animal Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Applied Anthrozoology":
                "School of Animal, Rural and Environmental Sciences",
                "Biodiversity Conservation":
                "School of Animal, Rural and Environmental Sciences",
                "Endangered Species Recovery and Conservation":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Performance":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Performance, Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Global Food Security and Development":
                "School of Animal, Rural and Environmental Sciences",
                "Architecture":
                "School of Architecture, Design and the Built Environment",
                "Architecture (ARB/RIBA Part 2)rch":
                "School of Architecture, Design and the Built Environment",
                "Building Surveying":
                "School of Architecture, Design and the Built Environment",
                "Civil Engineering":
                "School of Architecture, Design and the Built Environment",
                "Construction Management":
                "School of Architecture, Design and the Built Environment",
                "Construction Project Management (Online)":
                "School of Architecture, Design and the Built Environment",
                "Interior Architecture and Design":
                "School of Architecture, Design and the Built Environment",
                "International Real Estate Investment and Finance":
                "School of Architecture, Design and the Built Environment",
                "Planning and Development":
                "School of Architecture, Design and the Built Environment",
                "Project Management (Construction)":
                "School of Architecture, Design and the Built Environment",
                "Quantity Surveying":
                "School of Architecture, Design and the Built Environment",
                "Real Estate":
                "School of Architecture, Design and the Built Environment",
                "Structural Engineering with Management":
                "School of Architecture, Design and the Built Environment",
                "Structural Engineering with Materials":
                "School of Architecture, Design and the Built Environment",
                "Animation":
                "School of Art & Design",
                "Commercial Photography":
                "School of Art & Design",
                "Culture, Style and Fashion":
                "School of Art & Design",
                "Branding and Identity":
                "School of Art & Design",
                "Fashion Communications":
                "School of Art & Design",
                "Fashion Design":
                "School of Art & Design",
                "Fashion Knitwear Design":
                "School of Art & Design",
                "Fashion Marketing":
                "School of Art & Design",
                "Fine Art":
                "School of Art & Design",
                "Graphic Design":
                "School of Art & Design",
                "Illustration":
                "School of Art & Design",
                "International Fashion Management":
                "School of Art & Design",
                "Luxury Fashion Brand Management":
                "School of Art & Design",
                "Photography":
                "School of Art & Design",
                "Textile Design Innovation":
                "School of Art & Design",
                "Culture, Style and Fashion":
                "School of Art & Design",
                "Fashion Communications":
                "School of Art & Design",
                "Fashion Marketing":
                "School of Art & Design",
                "Fashion and Textile Design":
                "School of Art & Design",
                "Fine Art":
                "School of Art & Design",
                "Graphic Design Theory and Practice":
                "School of Art & Design",
                "International Fashion Management":
                "School of Art & Design",
                "Luxury Fashion Brand Management":
                "School of Art & Design",
                "Photography":
                "School of Art & Design",
                "PG Cert Creative Pattern Cutting (15 weeks)":
                "School of Art & Design",
                "Art and Design Professional Doctorate":
                "School of Art & Design",
                "Art and Design":
                "School of Art & Design",
                "Broadcast Journalism":
                "School of Arts and Humanities",
                "Digital and Newspaper Journalism":
                "School of Arts and Humanities",
                "Magazine Journalism":
                "School of Arts and Humanities",
                "Documentary Journalism":
                "School of Arts and Humanities",
                "Media and Globalisation":
                "School of Arts and Humanities",
                "Creative Writing":
                "School of Arts and Humanities",
                "English Literary Research":
                "School of Arts and Humanities",
                "Linguistics":
                "School of Arts and Humanities",
                "Philosophy":
                "School of Arts and Humanities",
                "History":
                "School of Arts and Humanities",
                "PGCert Museum and Heritage Development":
                "School of Arts and Humanities",
                "Holocaust and Genocide":
                "School of Arts and Humanities",
                "International Development":
                "School of Arts and Humanities",
                "English Language Teaching":
                "School of Arts and Humanities",
                "TESOL (Teaching English to Speakers of Other Languages)":
                "School of Arts and Humanities",
                "Management":
                "Nottingham Business School",
                "Management and Finance":
                "Nottingham Business School",
                "Management and Global Supply Chain Management":
                "Nottingham Business School",
                "Management and Innovation and Enterprise":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Management and Marketing":
                "Nottingham Business School",
                "Marketing":
                "Nottingham Business School",
                "Branding and Advertising":
                "Nottingham Business School",
                "Digital Marketing":
                "Nottingham Business School",
                "Management and Marketing":
                "Nottingham Business School",
                "fees, funding and scholarships":
                "Nottingham Business School",
                "Return to all courses":
                "Nottingham Business School",
                "Human resource Management":
                "Nottingham Business School",
                "Economics":
                "Nottingham Business School",
                "Economics and Investment Banking":
                "Nottingham Business School",
                "International Business":
                "Nottingham Business School",
                "International Business (Dual Award) ":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Management and International Publishing":
                "Nottingham Business School",
                "Management and Global Supply Chain Management":
                "Nottingham Business School",
                "Finance":
                "Nottingham Business School",
                "Finance and Accounting":
                "Nottingham Business School",
                "Finance and Investment Banking":
                "Nottingham Business School",
                "Management and Finance":
                "Nottingham Business School",
                "Economics and Investment Banking":
                "Nottingham Business School",
                "Entrepreneurship":
                "Nottingham Business School",
                "Project Management":
                "Nottingham Business School",
                "Management":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Marketing":
                "Nottingham Business School",
                "Branding and Advertising":
                "Nottingham Business School",
                "Finance":
                "Nottingham Business School",
                "International Business":
                "Nottingham Business School",
                "Assessment Only Route to QTS (Primary) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Assessment Only Route to QTS (Secondary) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Early Years Initial Teacher Training":
                "Nottingham Institute of Education",
                "Early Years Initial Teacher Training (Assessment Only) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Education":
                "Nottingham Institute of Education",
                "English Language Teaching":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with English and Literacy)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Mathematics and Numeracy)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Science, Engineering and Technology)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Special and Inclusive Practice)":
                "Nottingham Institute of Education",
                "Primary Education":
                "Nottingham Institute of Education",
                "Primary: School-Centred Initial Teacher Training (SCITT)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Primary salaried)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Primary)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Secondary salaried)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Secondary)":
                "Nottingham Institute of Education",
                "Secondary Biology":
                "Nottingham Institute of Education",
                "Secondary Business Education":
                "Nottingham Institute of Education",
                "Secondary Chemistry":
                "Nottingham Institute of Education",
                "Secondary Computer Science with ICT":
                "Nottingham Institute of Education",
                "Secondary Education (Design and Technology)":
                "Nottingham Institute of Education",
                "Secondary Education (Physics)":
                "Nottingham Institute of Education",
                "Secondary English":
                "Nottingham Institute of Education",
                "Secondary Mathematics":
                "Nottingham Institute of Education",
                "Secondary Music":
                "Nottingham Institute of Education",
                "Special Educational Needs Coordination - National Award":
                "Nottingham Institute of Education",
                "Teaching English to Speakers of Other Languages (TESOL)":
                "Nottingham Institute of Education",
                "Corporate and Insolvency Law":
                "Nottingham Law School",
                "Dual LLM in Corporate and Insolvency Law / European and Insolvency Law":
                "Nottingham Law School",
                "General Law":
                "Nottingham Law School",
                "Health Law and Ethics":
                "Nottingham Law School",
                "Human Rights and Justice":
                "Nottingham Law School",
                "Intellectual Property Law":
                "Nottingham Law School",
                "International Financial Law":
                "Nottingham Law School",
                "International Trade and Commercial Law":
                "Nottingham Law School",
                "Oil, Gas and Mining Law":
                "Nottingham Law School",
                "Sports Law":
                "Nottingham Law School",
                "Corporate and Insolvency Law":
                "Nottingham Law School",
                "International Trade and Commercial Law":
                "Nottingham Law School",
                "Legal Practice":
                "Nottingham Law School",
                "Oil, Gas and Mining Law":
                "Nottingham Law School",
                "Biomedical Science":
                "School of Science and Technology",
                "Biomedical Science (Flexible Learning)":
                "School of Science and Technology",
                "Neuropharmacology":
                "School of Science and Technology",
                "Pharmacology":
                "School of Science and Technology",
                "Molecular Microbiology":
                "School of Science and Technology",
                "Biotechnology":
                "School of Science and Technology",
                "Molecular Cell Biology":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Biotechnology":
                "School of Science and Technology",
                "Cancer Biology":
                "School of Science and Technology",
                "Cell Biology":
                "School of Science and Technology",
                "Molecular Biology":
                "School of Science and Technology",
                "Molecular Microbiology":
                "School of Science and Technology",
                "Neuropharmacology":
                "School of Science and Technology",
                "Pharmacology":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Biomedical Science (Flexible Learning)":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Chemistry / Chemistry (Professional Practice)":
                "School of Science and Technology",
                "Pharmaceutical and Medicinal Science":
                "School of Science and Technology",
                "Pharmaceutical Analysis":
                "School of Science and Technology",
                "Analytical Chemistry":
                "School of Science and Technology",
                "Chemistry":
                "School of Science and Technology",
                "Advanced Materials Engineering":
                "School of Science and Technology",
                "Forensic Science":
                "School of Science and Technology",
                "Computer Science":
                "School of Science and Technology",
                "Cloud and Enterprise Computing":
                "School of Science and Technology",
                "IT Security":
                "School of Science and Technology",
                "Engineering (Electronics)":
                "School of Science and Technology",
                "Engineering (Cybernetics and Communications)":
                "School of Science and Technology",
                "Engineering Management":
                "School of Science and Technology",
                "Computing Systems":
                "School of Science and Technology",
                "Data Analytics for Business":
                "School of Science and Technology",
                "Computer Science":
                "School of Science and Technology",
                "Electronic Systems":
                "School of Science and Technology",
                "Online MBA with Data Analytics":
                "School of Science and Technology",
                "Mathematical Sciences":
                "School of Science and Technology",
                "Data Analytics for Business":
                "School of Science and Technology",
                "Online MBA with Data Analytics":
                "School of Science and Technology",
                "Medical and Materials Imaging":
                "School of Science and Technology",
                "Medical Imaging":
                "School of Science and Technology",
                "Physics":
                "School of Science and Technology",
                "Physics":
                "School of Science and Technology",
                "Sport Science":
                "School of Science and Technology",
                "Exercise Physiology":
                "School of Science and Technology",
                "Performance Nutrition":
                "School of Science and Technology",
                "Performance Analysis":
                "School of Science and Technology",
                "Biomechanics":
                "School of Science and Technology",
                "Sport and Exercise Psychology":
                "School of Science and Technology",
                "Psychology":
                "School of Social Sciences",
                "Applied Child Psychology":
                "School of Social Sciences",
                "sychological Research Methods":
                "School of Social Sciences",
                "Forensic Mental Health":
                "School of Social Sciences",
                "Forensic Psychology (BPS accredited)":
                "School of Social Sciences",
                "Cyberpsychology":
                "School of Social Sciences",
                "Psychology in Clinical Practice":
                "School of Social Sciences",
                "Psychological Wellbeing and Mental Health":
                "School of Social Sciences",
                "Criminology":
                "School of Social Sciences",
                "Sociology":
                "School of Social Sciences",
                "Politics":
                "School of Social Sciences",
                "International Relations":
                "School of Social Sciences",
                "Online International Relations (Distance learning)":
                "School of Social Sciences",
                "Public Health":
                "School of Social Sciences",
                "Career Development":
                "School of Social Sciences",
                "Social Work (January 2019 entry)":
                "School of Social Sciences",
            }
            item['department'] = departmentDict.get(item['programme_en'])
            if item['department'] == None:
                item['department'] = departmentDict.get(item['programme_en'])
                if item['department'] == None:
                    item['department'] = departmentDict.get(
                        item['programme_en'])
                    if item['department'] == None:
                        item['department'] = departmentDict.get(
                            item['programme_en'].replace(" ", " "))
            print("item['department'] = ", item['department'])

            # School of Animal, Rural and Environmental Sciences
            # School of Architecture, Design and the Built Environment
            # School of Art &amp; Design
            # School of Arts and Humanities
            # Nottingham Business School
            # Nottingham Institute of Education
            # Nottingham Law School
            # School of Science and Technology
            # School of Social Sciences
            if item['department'] is None:
                if "/animal-rural-environmental-sciences" in item['url']:
                    item[
                        'department'] = "School of Animal, Rural and Environmental Sciences"
                elif "/architecture-design-built-environment" in item['url']:
                    item[
                        'department'] = "School of Architecture, Design and the Built Environment"
                elif "/art-design" in item['url']:
                    item['department'] = "School of Art & Design"
                elif "/arts-humanities" in item['url']:
                    item['department'] = "School of Arts and Humanities"
                elif "/business" in item['url']:
                    item['department'] = "Nottingham Business School"
                elif "/education" in item['url']:
                    item['department'] = "Nottingham Institute of Education"
                elif "/law" in item['url']:
                    item['department'] = "Nottingham Law School"
                elif "/science-technology" in item['url']:
                    item['department'] = "School of Science and Technology"
                elif "/social-sciences" in item['url']:
                    item['department'] = "School of Social Sciences"
            print("item['department']1 = ", item['department'])

            if item['degree_name'] == "BA (Hons)":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
            elif item['department'] == "School of Art & Design" or item[
                    'department'] == "School of Animal, Rural and Environmental Sciences" or item[
                        'department'] == "School of Science and Technology":
                item['ielts'] = 6.0
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            elif item['department'] == "Nottingham Business School":
                item['ielts'] = 6.5
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            elif item['department'] == "School of Architecture, Design and the Built Environment" or item[
                    'department'] == "School of Arts and Humanities" or item[
                        'department'] == "Nottingham Institute of Education" or item[
                            'department'] == "Nottingham Law School" or item[
                                'department'] == "School of Social Sciences" or item[
                                    'department'] == "School of Art & Design":
                item['ielts'] = 6.5
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            # print("item['IELTS'] = %s item['IELTS_L'] = %s item['IELTS_S'] = %s item['IELTS_R'] = %s item['IELTS_W'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            entry_requirements = response.xpath(
                "//div[@id='entry-requirements-1']//text()").extract()
            entry_requirementsStr = ''.join(entry_requirements)
            ielts = re.findall(r"IELTS.{1,200}", entry_requirementsStr)
            item['ielts_desc'] = ''.join(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts'] == None:
                ieltsDict = get_ielts(''.join(ielts))
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            item["require_chinese_en"] = remove_class(
                clear_lianxu_space([
                    """<h2>Entry requirements</h2>
                        <table id="table76765" style="width: 100%;"><thead><tr><th id="table76765r1c1"> Your qualification</th><th id="table76765r1c2"> You could study</th></tr></thead><tbody><tr><td headers="table76765r1c1">         High School Year 2<br />Grades of 70% and above       </td><td headers="table76765r1c2">         Foundation courses at <a href="https://www.kaplanpathways.com/colleges/nottingham-trent-international-college/courses/">Nottingham Trent International College (NTIC) </a></td></tr><tr><td headers="table76765r1c1">         High School Year 3<br />Grades of 80% and above       </td><td headers="table76765r1c2">         International Year One courses at NTIC       </td></tr><tr><td headers="table76765r1c1">         Completion of first year of Chinese university degree       </td><td headers="table76765r1c2">         First year bachelors degrees       </td></tr><tr><td headers="table76765r1c1">         Three year diploma or higher national diploma       </td><td headers="table76765r1c2">         Considered for final year entry to selected bachelors degrees or for Pre-Masters courses at <a href="https://www.kaplanpathways.com/colleges/nottingham-trent-international-college/courses/">Nottingham Trent International College</a></td></tr><tr><td headers="table76765r1c1">         Bachelors degree (four years or six years in medicine / dentistry) from recognised institution in China. <br />Grades of 75% or above<br />Grades of 70% or above from 211 universities       </td><td headers="table76765r1c2">         Postgraduate (Masters) courses       </td></tr><tr><td headers="table76765r1c1">         Masters degree from a recognised institution in China.<br />Grades of 70% or above       </td><td headers="table76765r1c2">         Postgraduate research       </td></tr></tbody></table><p>If you have questions about your qualification and it is not listed here, please <a href="mailto:[email protected]">contact us</a> for advice.</p>
"""
                ]))

            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS code(s):')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).replace(" / ",
                                                                "/").strip()
            print("item['ucascode']: ", item['ucascode'])

            duration = response.xpath(
                "//strong[contains(text(),'Course duration:')]/following-sibling::span//text()"
            ).extract()
            print("duration: ", duration)
            duration_str = ''.join(duration).replace("/ sandwich", "").strip()
            duration_list = getIntDuration(''.join(duration))
            # print("duration_list: ", duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            duration_per = item['duration_per']
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            if "/" in item['ucascode']:
                ucascode_sp = item['ucascode'].split("/")
                if "/" in duration_str:
                    duration_sp = duration_str.split("/")
                elif " or" in duration_str:
                    duration_sp = duration_str.split(" or")
                elif "," in duration_str:
                    duration_sp = duration_str.split(" or")
                else:
                    duration_sp = [duration_str, duration_str]
                print("ucascode_sp: ", ucascode_sp)
                print("duration_sp: ", duration_sp)
                if len(ucascode_sp) == 2:
                    item['ucascode'] = ucascode_sp[0]
                    duration_list = getIntDuration(duration_sp[0])
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    if item['duration'] == None:
                        item['duration'] = int(duration_sp[0].strip())
                        item['duration_per'] = duration_per
                    print("item['ucascode']1: ", item['ucascode'])
                    print("item['duration']1 = ", item['duration'])
                    print("item['duration_per']1 = ", item['duration_per'])
                    yield item

                    item['ucascode'] = ucascode_sp[-1]
                    duration_list = getIntDuration(duration_sp[-1].strip())
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    if item['duration'] == None:
                        item['duration'] = int(duration_sp[-1].replace(
                            "year", "").replace("(s)", "").strip())
                        item['duration_per'] = 1
                    print("item['ucascode']2: ", item['ucascode'])
                    print("item['duration']2 = ", item['duration'])
                    print("item['duration_per']2 = ", item['duration_per'])
                    yield item
            else:
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 29

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.port.ac.uk/"
        item['university'] = "University of Portsmouth"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item[
            'location'] = 'University House, Winston Churchill Avenue, Portsmouth PO1 2UP'
        print("===========================")
        print(response.url)
        try:
            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            programme = response.xpath("//h1[@class='Title']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            degree_type = response.xpath(
                "//h1[@class='Title']/small//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            ucascode = response.xpath(
                "//nobr[contains(text(),'UCAS Code')]/../following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).replace("UCAS Code:",
                                                         "").strip()
            # print("item['ucascode'] = ", item['ucascode'])

            item['start_date'] = response.meta.get(response.url)
            # print("item['start_date'] = ", item['start_date'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            # department = response.xpath(
            #     "//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department)
            # print("item['department']: ", item['department'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            duration = response.xpath(
                "//div[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_str = ''.join(duration)
            item['other'] = duration_str

            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            location = response.xpath(
                "//div[contains(text(),'Location')]/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location)
            # print("item['location']: ", item['location'])

            # //strong[contains(text(),'International students')]/../following-sibling::p[1]
            tuition_fee = response.xpath(
                "//h3[contains(text(),'Tuition fees')]/..//*[contains(text(),'International students')]//text()"
            ).extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))
            # print("tuition_fee_re: ", tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = int(tuition_fee_re[0].replace(
                    ",", "").replace("£", "").strip())
            # print("item['tuition_fee']: ", item['tuition_fee'])

            overview = response.xpath(
                """//h2[@id='overview']/..|//h3[contains(text(),"What you'll experience")]/..|
            //h3[contains(text(),'What you’ll experience')]/..|//*[contains(text(),"What you'll experience")]/../.."""
            ).extract()
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview)).replace(
                    "<h3>What you'll experience</h3>", "").strip()
            print("item['overview_en']: ", item['overview_en'])

            career = response.xpath(
                "//h3[contains(text(),'Careers and opportunities')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            rntry_requirements_content = response.xpath(
                "//div[contains(text(),'Entry Requirements')]/../../..//div[contains(text(),'2019 start')]/../../../..//text()"
            ).extract()
            rntry_requirements_str = clear_lianxu_space(
                rntry_requirements_content)

            ieltsList = response.xpath(
                "//*[contains(text(),'English language proficiency')]/text()|"
                "//*[contains(text(),'English Language proficiency')]/text()"
            ).extract()
            # print(ieltsList)
            if len(ieltsList) == 0:
                ieltsList = re.findall(r".{1,45}IELTS.{1,85}",
                                       rntry_requirements_str)
            clear_space(ieltsList)
            if len(ieltsList) > 0:
                item['ielts_desc'] = ''.join(ieltsList[1:]).strip()
                if item['ielts_desc'] == "":
                    item['ielts_desc'] = ''.join(ieltsList).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            alevel = response.xpath(
                "//*[contains(text(),'A levels')]/text()").extract()
            # print(ieltsList)
            if len(alevel) == 0:
                alevel = re.findall(r".{1,45}A\slevels.{1,85}",
                                    rntry_requirements_str)
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[1:]).strip()
                if item['alevel'] == "":
                    item['alevel'] = ''.join(alevel).strip()
            print("item['alevel']: ", item['alevel'])

            modules = response.xpath(
                "//h2[@id='What youll study']/..|//h2[@id='What youll study']/../following-sibling::div[1]|//div[contains(text(),'Units currently being studied')]/../../.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            teaching_assessment = response.xpath(
                "//h2[@id='Teaching']/..|//h2[@id='Teaching']/../following-sibling::*[1]|//h2[@id='How youre assessed']/..|//h2[@id='How youre assessed']/../following-sibling::*[1]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en']: ", item['assessment_en'])

            apply_proces_en = response.xpath(
                "//h2[@id='Apply']/..|//h2[@id='Apply']/../following-sibling::*"
            ).extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['apply_documents_en'] = remove_class(
                clear_lianxu_space([
                    """<h2 style="color: #384047; margin: 33px 0px 0.7em; padding: 0px;">What you'll need to send us</h2>
<p style="color: #384047; margin: 0px 0px 25px; border: none;">When you apply to join us, we'll need to see the following documents:</p>
<ul style="color: #384047; margin: 0px 0px 25px; padding-left: 35px; border: none; list-style-image: initial;">
    <li style="margin-top: 0px;">A completed application form</li>
    <li>A Personal Statement or Statement of Purpose</li>
    <li>Officially certified and translated copies of your high school or college qualification and grades (for undergraduate courses)</li>
    <li>Officially certified and translated copies of your degree qualification and grades (for Postgraduate courses)</li>
    <li>Proof of your English language level (such as an IELTS Certificate)</li>
    <li style="margin-bottom: 0px;">One academic reference on official headed paper for undergraduate courses or two references for postgraduate courses</li>
</ul>"""
                ]))
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h3>Undergraduate courses</h3>
<p>If you've completed the Chinese Senior High School Diploma plus one year at a recognised university in China, we'll consider you for admission onto an undergraduate course such as a Bachelor's degree. You must have studied relevant subjects and achieved strong grades.</p>
<p>If you don't have a Chinese Senior High School Diploma, you can apply with:</p>
<h4>A levels</h4>
<ul>
    <li>Most courses will require 120 UCAS points. Your A level grades should equal or exceed the total points required. You can use the&nbsp;<a rel="noopener noreferrer" rel="noopener noreferrer" href="https://www.ucas.com/ucas/tariff-calculator"></a><a rel="noopener noreferrer" href="https://www.ucas.com/ucas/tariff-calculator" target="_blank">UCAS Tariff Calculator</a>&nbsp;to work out your total points. Please check your specific course page to find the exact number of points.</li>
    <li>Some courses will require you to have studied specific subjects at A level. For example, to study a science course you will usually need to have achieved passing grades in scientific subjects at A level.</li>
    <li>A level points: A* = 56 A = 48 B = 40 C = 32 D = 24.</li>
</ul>
<h4>International Baccalaureate</h4>
<ul>
    <li>Most courses will require between 24 and 31 points in the International Baccalaureate (IB), depending on the degree you apply for.</li>
</ul>
<p>You may also be considered for advanced entry onto a relevant undergraduate degree programme if you have a College Graduation Diploma (Dazhuan) from a recognised university or college on completion of two to three years of study, or a BTEC HND or SQA HND Higher National Diploma in a relevant subject.</p>
<p>You may be able to join an undergraduate course with other qualifications. We do consider qualifications from a range of sources. Contact us to find out more.</p>"""
                ]))
            item[
                "ib"] = "Most courses will require between 24 and 31 points in the International Baccalaureate (IB), depending on the degree you apply for."
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 30

0

Exibir arquivo

Arquivo: AberystwythUniversity_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        """Scrape one Aberystwyth University course page into an item.

        An item obtained from ``get_item(ScrapyschoolEnglandBenItem)`` is
        filled with fixed values and with the results of XPath queries on
        ``response``, then yielded.  Any exception raised during extraction
        is appended to a per-university error file and printed rather than
        propagated, so a failing page yields nothing.

        :param response: object exposing ``url``, ``meta`` and ``xpath()``
            (presumably a Scrapy response -- confirm against the caller).
            ``response.meta`` is looked up by ``response.url`` to obtain
            ``major_type1``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Aberystwyth University"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        item['location'] = "Aberystwyth University, Reception, Penglais, Aberystwyth, Ceredigion, SY23 3FL"
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])

        def grab(query):
            # Run an XPath query against the page and return the extracted list.
            return response.xpath(query).extract()

        def as_html(fragments):
            # Same clean-up chain applied to every HTML section field.
            return remove_class(clear_lianxu_space(fragments))

        try:
            banner_text = grab("//div[@class='banner__caption banner__caption--below']//text()")
            item['department'] = ''.join(banner_text).replace("in the ", "").strip()

            # Programme name and degree name
            degree_parts = grab("//div[@class='hero-header']//header/span//text()")
            item['degree_name'] = ''.join(degree_parts).strip()
            print("item['degree_name']: ", item['degree_name'])

            programme_parts = grab("//div[@class='hero-header']//header/h1//text()")
            item['programme_en'] = ''.join(programme_parts).strip()
            print("item['programme_en']: ", item['programme_en'])

            length_parts = grab(
                "//h3[contains(text(),'Course Length')]/following-sibling::p//text()")
            # Return value is ignored; presumably cleans the list in place.
            clear_space(length_parts)
            parsed_duration = getIntDuration(''.join(length_parts))
            # Duration fields are only set when exactly two values come back.
            if len(parsed_duration) == 2:
                item['duration'] = parsed_duration[0]
                item['duration_per'] = parsed_duration[-1]

            overview_parts = grab(
                "//div[@class='key-facts']/following-sibling::p|"
                "//h3[@id='course-overview']|//h3[@id='course-overview']/following-sibling::div[1]|"
                "//h3[@id='coursedetails']|//h3[@id='coursedetails']/following-sibling::div[1]")
            if not overview_parts:
                # Fallback layout: take the parent of the "Overview" heading.
                overview_parts = grab("//h2[contains(text(),'Overview')]/..")
            item['overview_en'] = as_html(overview_parts)

            # Sections captured as "heading + first following div", cleaned as HTML.
            html_sections = (
                ('modules_en',
                 "//h3[@id='coursecontent']|//h3[@id='coursecontent']/following-sibling::div[1]"),
                ('career_en',
                 "//h3[@id='employability']|//h3[@id='employability']/following-sibling::div[1]"),
                ('assessment_en',
                 "//h3[contains(text(),'Teaching & Learning')]|//h3[contains(text(),'Teaching & Learning')]/following-sibling::div[1]"),
            )
            for field, query in html_sections:
                item[field] = as_html(grab(query))

            # Entry requirements kept as text (no remove_class pass).
            text_sections = (
                ('alevel',
                 "//h3[contains(text(),'Typical A-level offer')]/following-sibling::p//text()"),
                ('ib',
                 "//strong[contains(text(),'International Baccalaureate:')]/../text()"),
            )
            for field, query in text_sections:
                item[field] = clear_lianxu_space(grab(query))

            ielts_parts = grab(
                "//strong[contains(text(),'English language requirements')]/..//text()")
            item['ielts_desc'] = ''.join(ielts_parts).strip()

            # Copy the parsed IELTS scores onto the item fields.
            ielts_scores = get_ielts(item['ielts_desc'])
            ielts_fields = (
                ('ielts', 'IELTS'),
                ('ielts_l', 'IELTS_L'),
                ('ielts_s', 'IELTS_S'),
                ('ielts_r', 'IELTS_R'),
                ('ielts_w', 'IELTS_W'),
            )
            for field, key in ielts_fields:
                item[field] = ielts_scores.get(key)

            ucas_parts = grab("//span[contains(@title,'UCAS Code')]/em//text()")
            clear_space(ucas_parts)
            print("ucascode: ", ucas_parts)
            item['ucascode'] = ''.join(ucas_parts).strip()
            print("len: ", len(ucas_parts))
            print("item['ucascode'] = ", item['ucascode'])

            # Tuition fee link
            item['other'] = "https://www.aber.ac.uk/en/international/fees-scholarships/fees-money/int-under/"
            # NOTE(review): the apply URL hard-codes course=W402 for every page -- confirm intent.
            item['apply_proces_en'] = "https://www.aber.ac.uk/en/undergrad/apply/?course=W402"
            yield item
        except Exception as e:
            # Append the failure to an error file named after university + degree type.
            error_path = ("scrapySchool_England_Ben/error/" + item['university']
                          + str(item['degree_type']) + ".txt")
            with open(error_path, 'a', encoding="utf-8") as error_file:
                error_file.write(
                    str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)