def parse_main(self, response):
    """Scrape a Leeds Beckett University course page into a single item.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...)`` are used.

    Yields:
        The object returned by ``get_item1(ScrapyschoolEnglandItem1)`` after
        the scraped and hard-coded fields have been assigned to it.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Leeds Beckett University'
    item['url'] = response.url
    # The location is a fixed value; nothing from the page is stored for it.
    item['location'] = 'Leeds'

    degree_name = response.xpath('//div[@class="course-hero__label"]/text()').extract()
    item['degree_name'] = ''.join(degree_name).strip()

    programme = response.xpath('//h1[@class="course-hero__title"]/text()').extract()
    item['programme_en'] = ''.join(programme).strip()

    department = response.xpath('//div[@class="course-hero__labels"]/a/text()').extract()
    item['department'] = ''.join(department)

    # '1' when the attendance text mentions "full" (case-insensitive), else '2'.
    attendance = response.xpath('//div[contains(text(),"Attendance")]/following-sibling::div//text()').extract()
    item['teach_time'] = '1' if re.search('(?i)full', ''.join(attendance)) else '2'

    start_date = response.xpath('//div[contains(text(),"Start Date")]/following-sibling::div//text()').extract()
    # De-duplicate while keeping first-seen order so the joined value is
    # deterministic (a plain set() has arbitrary iteration order).
    item['start_date'] = ','.join(dict.fromkeys(tracslateDate(start_date)))

    duration = clear_duration(
        response.xpath('//div[contains(text(),"Duration")]/following-sibling::span//text()').extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    overview = response.xpath('//h2[contains(text(),"Overview")]/../following-sibling::div').extract()
    item['overview_en'] = remove_class(overview)

    entry = response.xpath('//h2[contains(text(),"Entry Requirements")]/../following-sibling::div').extract()
    item['rntry_requirements'] = remove_class(entry)

    ielts = get_ielts(response.xpath('//div[@class="entry-ielts"]/text()').extract())
    # The original guard (`!= [] or != {}`) was always true; only copy the
    # scores when something was actually parsed.
    if ielts:
        try:
            for field, key in (('ielts_l', 'IELTS_L'), ('ielts_s', 'IELTS_S'),
                               ('ielts_r', 'IELTS_R'), ('ielts_w', 'IELTS_W'),
                               ('ielts', 'IELTS')):
                item[field] = ielts[key]
        except (KeyError, TypeError):
            # Best effort: a partially parsed result leaves the rest unset.
            pass

    career = response.xpath('//h3[contains(text(),"Careers")]/following-sibling::div').extract()
    item['career_en'] = remove_class(career)

    modules = response.xpath(
        '//div[@class="course-modules__table-modules"]//div[@class="course-modules__dropdowns"]').extract()
    item['modules_en'] = remove_class(modules)

    # Tuition fee: the largest pound amount of at least three digits found in
    # the page.  Comma-grouped amounts ("£9,250") are accepted as well as
    # plain ones ("£9250"); the old pattern silently missed the former.
    fee_text = ''.join(response.xpath('//div[contains(text(),"£")]/text()').extract())
    amounts = [
        int(amount.replace(',', ''))
        for amount in re.findall(r'£(\d{1,3}(?:,\d{3})+(?!\d)|\d{3,})', fee_text)
    ]
    if amounts:
        item['tuition_fee'] = max(amounts)
    item['tuition_fee_pre'] = '£'

    apply_documents = [
        "Academic Certificates.",
        "Evidence of your English language ability (see below).",
        "A photocopy of your passport.",
        "A reference to support your application – either academic or professional.",
        "A completed Agent Consent Form (required if you are applying via or with the help of an agent).",
    ]
    item['apply_documents_en'] = '\n'.join(apply_documents)

    apply_process = [
        "Applying for a postgraduate course",
        "Once you have found the course you want to study in our online prospectus you will then click on the ‘Apply Now’ button located at the top of the online course page. ",
        "You will be asked to create an account on our application portal and complete your application via your Leeds Beckett account. Once you have submitted your application you should receive a decision within six weeks of applying. The exception to this is if the course you have applied for has a closing date specified. In this case, we will wait until the closing date has passed before we contact you",
    ]
    item['apply_proces_en'] = '\n'.join(apply_process)

    yield item
def parse(self, response):
    """Scrape an Ulster University course page into a single item.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...)`` are used.

    Yields:
        The object returned by ``get_item1(ScrapyschoolEnglandItem1)`` after
        the scraped and hard-coded fields have been assigned to it.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Ulster University'
    item['url'] = response.url
    item['location'] = 'Belfast'
    item['teach_time'] = '1'

    heading = ''.join(response.xpath('//h1//text()').extract()).strip()
    # Everything from the first "-" on a line onwards is taken as the award.
    award = ''.join(re.findall('-.+', heading))
    item['programme_en'] = heading.replace(award, '').replace('*', '').strip()
    award = award.replace('-', '').strip()
    item['degree_name'] = award
    # startswith() is safe on an empty award, unlike indexing award[0].
    if award.startswith('M'):
        item['degree_type'] = '2'
    elif award.startswith('P'):
        item['degree_type'] = '3'

    overview = response.xpath('//h2[contains(text(),"Overview")]/following-sibling::*').extract()
    item['overview_en'] = remove_class(overview)

    item['modules_en'] = remove_class(response.xpath('//div[@id="modules"]').extract())
    item['rntry_requirements'] = remove_class(response.xpath('//div[@id="entryconditions"]').extract())
    item['career_en'] = remove_class(response.xpath('//div[@id="opportunities"]').extract())

    start_date = response.xpath(
        '//h3[contains(text(),"Start dates")]/following-sibling::*//text()').extract()
    # De-duplicate in first-seen order (set() order is arbitrary) and join
    # with ',' like the other callbacks in this file; the previous '.'
    # separator was inconsistent with them.
    item['start_date'] = ','.join(dict.fromkeys(tracslateDate(start_date))).strip()

    ielts = get_ielts(response.xpath('//*[contains(text(),"IELTS")]//text()').extract())
    # The original guard (`!= [] or != {}`) was always true; only copy the
    # scores when something was actually parsed.
    if ielts:
        try:
            for field, key in (('ielts_l', 'IELTS_L'), ('ielts_s', 'IELTS_S'),
                               ('ielts_r', 'IELTS_R'), ('ielts_w', 'IELTS_W'),
                               ('ielts', 'IELTS')):
                item[field] = ielts[key]
        except (KeyError, TypeError):
            # Best effort: a partially parsed result leaves the rest unset.
            pass

    fee = response.xpath(
        '//dt[contains(text(),"International:")]/following-sibling::dd/text()').extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'

    yield item
def parses(self, response):
    """Scrape a University College London programme page into a single item.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...)`` are used.

    Yields:
        The object returned by ``get_item1(ScrapyschoolEnglandItem1)`` after
        the scraped and hard-coded fields have been assigned to it.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'University College London'
    item['url'] = response.url
    item['tuition_fee_pre'] = '£'

    location = response.xpath('//div/strong[contains(text(),"Location")]/../text()').extract()
    item['location'] = ''.join(location).strip()

    programme = ''.join(response.xpath('//h1[@class="heading"]//text()').extract())
    # Award abbreviations such as "MSc"/"MA"/"BSc".  Duplicates are dropped in
    # first-seen order so the joined name is deterministic (set() order is not).
    awards = re.findall('[MB][A-Z]{1,2}[a-z]*', programme)
    degree_name = ''.join(dict.fromkeys(awards)).strip()
    item['programme_en'] = programme.replace(degree_name, '')
    item['degree_name'] = degree_name
    item['degree_type'] = '2'

    # Any text node containing "FT" marks the programme as full time.
    full_time = response.xpath('//*[contains(text(),"FT")]//text()').extract()
    item['teach_time'] = 1 if full_time else 2

    department = response.xpath(
        '//h5[contains(text(),"Department website")]/following-sibling::p/a/text()').extract()
    item['department'] = ''.join(department).strip()

    overview = response.xpath(
        '//article[@class="article"]/h1/following-sibling::article/p[1]').extract()
    item['overview_en'] = remove_class(overview)

    opens = tracslateDate(response.xpath('//div[contains(text(),"Open")]/text()').extract())
    item['application_open_date'] = ','.join(dict.fromkeys(opens))

    closes = tracslateDate(response.xpath('//div[contains(text(),"Close")]/text()').extract())
    item['deadline'] = ','.join(dict.fromkeys(closes))

    item['tuition_fee'] = getTuition_fee(
        response.xpath('//*[contains(text(),"£")]//text()').extract())

    duration = clear_duration(
        response.xpath('//h4[contains(text(),"uration")]/following-sibling::div/text()').extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    starts = tracslateDate(
        response.xpath('//h4[contains(text(),"tarts")]/following-sibling::p//text()').extract())
    item['start_date'] = ','.join(dict.fromkeys(starts))

    item['apply_fee'] = '75'
    item['apply_pre'] = '£'

    # The page only names an English level; the concrete requirements for
    # each level are the fixed descriptions below.
    english_levels = {
        'Standard': (
            'Overall grade of 6.5 with a minimum of 6.0 in each of the subtests.',
            'Overall score of 92 with 24/30 in reading and writing and 20/30 in speaking and listening.',
        ),
        'Good': (
            'Overall grade of 7.0 with a minimum of 6.5 in each of the subtests.',
            'Overall score of 100 with 24/30 in reading and writing and 20/30 in speaking and listening.',
        ),
        'Advanced': (
            'Overall grade of 7.5 with a minimum of 6.5 in each of the subtests.',
            'Overall score of 109 with 24/30 in reading and writing and 20/30 in speaking and listening.',
        ),
    }
    eng_level = ''.join(
        response.xpath('//p[contains(text(),"English language")]/strong/text()').extract()).strip()
    ielts, toefl = english_levels.get(eng_level, ('', ''))

    ieltss = get_ielts(ielts)
    if ieltss != {} and ieltss != []:
        item['ielts_l'] = ieltss['IELTS_L']
        item['ielts_s'] = ieltss['IELTS_S']
        item['ielts_r'] = ieltss['IELTS_R']
        item['ielts_w'] = ieltss['IELTS_W']
        item['ielts'] = ieltss['IELTS']

    # Parse the TOEFL sentence by its wording.  Taking every number in order
    # (as before) picked up the "/30" denominators and stored 30 as the
    # listening and writing scores.
    scores = re.search(
        r'score of (\d+) with (\d+)/30 in reading and writing and (\d+)/30 in speaking and listening',
        toefl)
    if scores:
        overall, reading_writing, speaking_listening = scores.groups()
        item['toefl'] = overall
        item['toefl_r'] = reading_writing
        item['toefl_w'] = reading_writing
        item['toefl_s'] = speaking_listening
        item['toefl_l'] = speaking_listening
    item['ielts_desc'] = ielts
    item['toefl_desc'] = toefl

    entry = response.xpath('//h4[contains(text(),"ntry")]/following-sibling::p[1]').extract()
    item['rntry_requirements'] = remove_class(entry)

    chinese_requirement = [
        "<div>Equivalent qualifications for China",
        "Bachelor's degree with a minimum overall average mark of 80%. Please note that a number of programmes / departments will require higher marks.",
        "ALTERNATIVE QUALIFICATIONS",
        "Medical/ Dental/ Master's degree; Doctorate.</div>",
    ]
    item['require_chinese_en'] = '\n'.join(chinese_requirement)

    modules = response.xpath('//h2[contains(text(),"About this")]/following-sibling::div').extract()
    item['modules_en'] = remove_class(modules)

    career = response.xpath('//h2[contains(text(),"Career")]/following-sibling::div').extract()
    item['career_en'] = remove_class(career)

    yield item
def parse_main(self, response):
    """Scrape a University of Leicester course page into one item per award.

    The page lists parallel columns of course name, qualification, duration
    and start date; one item is produced per row, except for rows whose
    qualification is PGDip, PGCert or PGCE.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...)`` are used.

    Yields:
        An independent copy of the item built from
        ``get_item1(ScrapyschoolEnglandItem1)`` for every retained row.
    """
    import copy  # local import: only this callback needs it

    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'University of Leicester'
    item['url'] = response.url
    item['tuition_fee_pre'] = '£'

    department = response.xpath(
        '//dt[contains(text(),"Department")]/following-sibling::dd/text()').extract()
    item['department'] = ''.join(department).strip()

    overview = response.xpath(
        '//h2[contains(text(),"Course description")]/following-sibling::*').extract()
    item['overview_en'] = remove_class(overview)

    chinese_require = [
        "<p>",
        "If you have completed a four-year Bachelors degree in China, you can be considered for entry to a Masters degree at Leicester. Our requirements depend on the rank of the university from which you graduated and your chosen Masters degree. The following is intended as a guide to our requirements:</p>",
        "<p>If you have graduated from a 'top 200' university in China, you may be asked for 70% overall if you are applying for an Engineering or Science degree, or 75% for an Arts, Humanities, Law or Social Science degree. You may need to have scores of at least 80% in modules that are particularly relevant to your chosen Master’s degree. The School of Museum Studies requires at least 80% overall.</p>",
        "<p>If you graduated from a Chinese university ranked below the top 200 you may require higher scores (80-85%).</p>",
        "<p>If you have completed a three-year college diploma from a Chinese university, you will need to take an accepted one-year Pre-Masters course or upgrade your diploma to a Bachelor’s degree before applying for a Master’s degree.</p>",
    ]
    item['require_chinese_en'] = remove_class(chinese_require)

    entry = remove_class(response.xpath(
        '//h2[contains(text(),"Entry requirements")]/following-sibling::*').extract())
    # Strip the boilerplate about the international-qualifications lookup.
    for boilerplate in (
            'International Qualifications',
            'Countries list',
            'Find your country in this list to check equivalent qualifications, scholarships and additional requirements.'):
        entry = entry.replace(boilerplate, '')
    item['rntry_requirements'] = entry

    fee = response.xpath(
        '//h3[contains(text(),"International Students")]/following-sibling::*//text()').extract()
    item['tuition_fee'] = getTuition_fee(fee)

    item['career_en'] = remove_class(response.xpath('//div[@id="careers"]').extract())
    item['modules_en'] = remove_class(response.xpath('//div[@id="course-structure"]').extract())

    assessment = response.xpath(
        '//h2[contains(text(),"Teaching and learning")]/following-sibling::div').extract()
    item['assessment_en'] = remove_class(assessment)

    ielts = get_ielts(response.xpath('//*[contains(text(),"IELTS")]//text()').extract())
    # Guard against both empty forms, as the sibling callbacks do; checking
    # only `!= []` raised KeyError on an empty dict.
    if ielts != [] and ielts != {}:
        item['ielts'] = ielts['IELTS']
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']

    # Derive the TOEFL overall score from the IELTS overall score.
    for ielts_score, toefl_score in ((6.0, 80), (6.5, 90), (7.0, 100)):
        if item['ielts'] == ielts_score:
            item['toefl'] = toefl_score
            break
    # NOTE(review): relies on get_item1() pre-populating 'ielts'/'toefl';
    # confirm the default is None so the sub-scores are only set with a score.
    if item['toefl'] is not None:
        item['toefl_l'] = '17'
        item['toefl_s'] = '20'
        item['toefl_r'] = '18'
        item['toefl_w'] = '17'

    programme = response.xpath(
        '//span[contains(text(),"Course")]/following-sibling::span/text()').extract()
    degree_name = response.xpath(
        '//span[contains(text(),"Qualification")]/following-sibling::span/text()').extract()
    duration = response.xpath(
        '//span[contains(text(),"Duration")]/following-sibling::span/text()').extract()
    start_date = response.xpath(
        '//span[contains(text(),"Start Dates")]/following-sibling::span/text()').extract()
    if not start_date:
        # No start dates on the page: pad with blanks for every course row
        # (a fixed list of four silently truncated longer tables).
        start_date = [''] * len(programme)

    skipped_awards = ('PGDip', 'PGCert', 'PGCE')
    for pro, deg, dur, sta in zip(programme, degree_name, duration, start_date):
        item['programme_en'] = pro
        item['degree_name'] = deg
        parsed_duration = clear_duration(dur)
        item['duration'] = parsed_duration['duration']
        item['duration_per'] = parsed_duration['duration_per']
        item['start_date'] = ','.join(tracslateDate(sta))
        item['teach_time'] = 'fulltime' if re.search('(?i)full', dur) else 'parttime'
        if deg not in skipped_awards:
            # Yield a copy: the shared item is mutated on the next iteration,
            # so a consumer holding a reference would see later rows' values.
            yield copy.deepcopy(item)
def parses(self, response):
    """Scrape a Harper Adams University course page into a single item.

    Besides the page itself, the module lists are fetched with blocking
    ``requests.get`` calls, one per tab whose title contains "M".

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...)`` are used.

    Yields:
        The object returned by ``get_item1(ScrapyschoolEnglandItem1)`` after
        the scraped and hard-coded fields have been assigned to it.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    duration_text = ''.join(response.xpath(
        '//h4[contains(text(),"Duration")]/following-sibling::p[1]//text()').extract())
    if '1' in duration_text:
        item['duration'] = 1
        item['duration_per'] = 1
    item['teach_time'] = 'fulltime' if 'full' in duration_text else 'parttime'

    start_nodes = response.xpath('//*[contains(text(),"Start")]/../text()').extract()
    try:
        item["start_date"] = ','.join(tracslateDate(start_nodes))
    except Exception:
        # Best effort: the start date stays unset when it cannot be parsed.
        pass

    # The course name is derived from the last URL path segment.
    course = response.url.split('/')[-1].replace('-', ' ').title()

    entry_requirements = clear_same_s(remove_class(
        response.xpath('//div[@id="entry-requirements"]').extract()))
    course_overview = clear_same_s(remove_class(
        response.xpath('//div[@id="overview"]').extract()))
    career = clear_same_s(remove_class(
        response.xpath('//div[@id="careers"]').extract()))

    assessment = response.xpath('//div[@id="teaching"]').extract()
    if assessment == []:
        # Report pages that have no teaching section.
        print(response.url)
    assessment = remove_class(assessment)

    master = ''.join(response.xpath('//div[@class="page-heading"]/h2/text()').extract())

    item['ielts'] = '6.0'
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'
    item['toefl_r'] = '18'
    item['toefl_l'] = '18'
    item['toefl_s'] = '22'
    item['toefl_w'] = '20'
    item['toefl'] = '80'
    item["university"] = 'Harper Adams University'
    item["programme_en"] = course
    item["degree_name"] = master
    item["overview_en"] = course_overview
    item["assessment_en"] = assessment
    item["career_en"] = career
    item["tuition_fee"] = '12650'
    item['tuition_fee_pre'] = '£'
    # Fixed "how to apply" markup, passed through remove_class() as-is.
    item['apply_proces_en'] = remove_class([
        "<div>", " <div>", " <div>", " <div>", " <div>", " <div>", " ",
        " <p>Applying for university as an international student is similar to the process that UK students follow, but there are a few extra steps.</p>",
        " ",
        " <p>These include:</p>",
        " <ul>",
        " <li>Taking an <a>English language test</a></li>",
        " <li>Applying for a <a>visa</a></li>",
        " <li>Attending a pre-sessional course</li>",
        " </ul>",
        " <p>To understand the general steps for applying to one of courses, take a look at our <a>How to apply</a> pages.</p>",
        " </div>", " </div><div>", " <div> ", " ",
        " </div>", " </div>", " </div>", " ", " </div>", " </div>", " </div>",
        " <div>", " ", " <div>", " <div>", " <div>",
        " <div>Before you apply</div>",
        " </div>",
        " <div>", " <div>", " <div>", " <div>", " <div>", " <div>", " ",
        " <p>To study on a course at Harper Adams, you'll need to meet the entry requirements listed on the <a>English language requirements</a> and you may need to take an English language test.</p>",
        " <p>Like UK students, if you're applying for one of our undergraduate courses, you'll need to apply through the <a>complete an application form</a>.</p>",
        " </div>", " </div><div>", " <div> ", " ",
        " </div>", " </div>", " </div>", " </div>", " </div>", " </div>", " </div>",
        " <div>", " <div>",
        " <div>After you apply</div>",
        " </div>",
        " <div>", " <div>", " <div>", " <div>", " <div>", " <div>", " ",
        " <p>We'll look at your application and decide if you meet the entry requirements. We may ask to interview you. We'll keep you updated about the status of your application by email or post.</p>",
        " <p>If we accept your application, we'll send you either an unconditional or conditional offer. Unconditional offers mean you have been accepted to study on a course without any other requirements. A conditional offer means you'll have to give us some additional information or prove a qualification.</p>",
        " </div>", " </div><div>", " <div> ", " ",
        " </div>", " </div>", " </div>", " </div>", " </div>", " </div>", " </div>",
        " <div>", " <div>",
        " <div>Before you arrive</div>",
        " </div>",
        " <div>", " <div>", " <div>", " <div>", " <div>", " <div>", " ",
        " <p>Depending on where you're coming from, you'll need to <a>visa pages</a> to find out more.</p>",
        " <p>As part of the visa application process, you may need to submit a Confirmation of Acceptance for Studies (CAS) number or a similar letter that says you've been accepted to study here.</p>",
        " <h3>Confirmation of Acceptance for Studies (CAS) number</h3>",
        " <p>If you meet all of the conditions of your offer by the deadline printed on your offer letter, we'll give you a Confirmation of Acceptance for Studies (CAS) number. You'll need your CAS number to apply for your visa.</p>",
        " <p>Your CAS number is unique to you and your place at Harper Adams. It can't be transferred to any other university. If you decide to withdraw your application, you must let us know so we can cancel your CAS number.</p>",
        " <h3>Short-term study visa letters</h3>",
        " <p>If you're applying for a course that requires a <a>short-term study visa</a>, and you've met any offer conditions we've set, we'll give you a letter that confirms we've accepted you. You'll need to submit this with your visa application. You may also need to show it when you enter the UK.</p>",
        " <h3>Applying for accommodation</h3>",
        " <p>You'll need to apply for <a>accommodation</a> before you arrive in the UK. We'll send you details of how to do this along with your offer letter. You'll need to tell the university in advance if you're bringing family to live with you.</p>",
        " </div>", " </div><div>", " <div> ", " ",
        " </div>", " </div>", " </div>", " </div>", " </div>", " </div>", " </div>",
        " <div>", " <div>",
        " <div>When you arrive</div>",
        " </div>",
        " <div>", " <div>", " <div>", " <div>", " <div>", " <div>", " ",
        " <p>We'll let you know the date that you need to arrive by in your offer letter. You'll need to make arrangements to travel to the UK and get to Harper Adams by this date.</p>",
        " <p>When you first arrive in the UK, you'll need to go through immigration controls. To help you get through immigration as quickly and easily as possible, you should:</p>",
        " <ul>",
        " <li>Not arrive before the start date of your visa</li>",
        " <li>Make sure you've filled in a landing card (if required) and included details of a UK contact - this can be the university's address or the address of a landlord</li>",
        " <li>Have your passport, CAS or offer letter, details of where you'll stay and proof that you have enough money to study here ready to show immigration officers</li>",
        " <li>Make sure you know the conditions of your visa, when it expires, and the number of hours you are allowed to work</li>",
        " <li>Declare any sums of cash over €10,000 (or equivalent in your currency).</li>",
        " </ul>",
        " <p>To avoid any issues at immigration, you should not:</p>",
        " <ul>",
        " <li>Bring food or drink (such as meat, dairy products, fish, eggs, honey, fruit, vegetables or plants) with you.</li>",
        " <li>Bring counterfeit goods, firearms, weapons or indecent/obscene material with you.</li>",
        " </ul>",
        " ",
        " <p>More information on travelling through the UK border can be found at <a>www.gov.uk/government/publications/coming-to-the-uk/faster-travel-through-the-uk-border</a></p>",
        " </div>", " </div><div>", " <div> ", " ",
        " </div>", " </div>", " </div>", " </div>", " </div>", " </div>", " </div>",
        " <div>", " <div>",
        " <div>After you arrive</div>",
        " </div>",
        " <div>", " <div>", " <div>", " <div>", " <div>", " <div>", " ",
        " <p>On your first day at Harper Adams, you'll need to bring your passport and visa (as well as any certificates or documents we've requested) so we can make a copy for our reference.</p>",
        " </div>", " </div><div>", " <div> ", " ",
        " </div>", " </div>", " </div>", " </div>", " </div>", " </div>", " </div>",
    ])
    item["rntry_requirements"] = entry_requirements
    item["url"] = response.url
    item['location'] = 'Edgmond'

    tab_handlers = response.xpath('//div[@class="tabmenu"]/ul/li/a/@onclick').extract()
    tab_titles = response.xpath('//div[@class="tabmenu"]/ul/li/a/@title').extract()
    print(tab_titles)
    print(tab_handlers)

    # Collect the module markup of every tab whose title contains "M".
    # Fragments are appended whole; the old code extended a list character by
    # character and reset everything collected so far on a non-matching tab.
    module_fragments = []
    for title, handler in zip(tab_titles, tab_handlers):
        if 'M' not in title:
            continue
        print('要这个专业的课程')
        print(title)
        # The onclick handler carries the id, year of entry and route, in
        # that order, as its first three numbers.
        numbers = re.findall(r'\d+', handler)
        if len(numbers) < 3:
            continue
        fullurl = ('https://www.harper-adams.ac.uk/shared/get-pg-route-modules.cfm?id='
                   + numbers[0] + '&year_of_entry=' + numbers[1]
                   + '&route=' + numbers[2])
        print(fullurl)
        try:
            # A timeout keeps a stalled server from blocking the crawl forever.
            page = requests.get(fullurl, timeout=30)
        except requests.RequestException as exc:
            print('module request failed:', fullurl, exc)
            continue
        sections = etree.HTML(page.content).xpath('//div[@class="content-section-inner"]')
        module_fragments.append(''.join(
            etree.tostring(section, method='html', encoding='unicode')
            for section in sections))
    item['modules_en'] = remove_class(module_fragments)

    yield item
def parses(self, response):
    """Scrape a Staffordshire University course page into a single item.

    Several XPath expressions are unions so that both page layouts targeted
    by the selectors are covered.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...)`` are used.

    Yields:
        The object returned by ``get_item1(ScrapyschoolEnglandItem1)`` after
        the scraped and hard-coded fields have been assigned to it.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Staffordshire University'
    item['url'] = response.url
    item['location'] = 'Staffordshire'

    # The earlier first-pass computation of programme/degree was dead code:
    # both fields were unconditionally overwritten by the values below.
    programme = ''.join(response.xpath(
        '//div[@class="col-sm-9"]/h1/text()|//div[@id="main"]//h1/text()'
    ).extract()).strip()
    # An award is two capitals followed by letters, "/" or whitespace.
    degree = re.findall(r'[A-Z]{2}[/a-zA-Z\s]*', programme)
    programme = programme.replace(''.join(degree), '').strip()
    if not degree:
        # No award in the heading: fall back to the hero sub-heading.
        degree = response.xpath('//h2[@class="hero_header text-center"]/text()').extract()
    item['degree_name'] = ''.join(degree).strip()
    item['programme_en'] = programme

    duration = clear_duration(response.xpath(
        '//th[contains(text(),"Duration")]/following-sibling::td/text()|//dt[contains(text(),"Duration")]/following-sibling::dd[1]/text()'
    ).extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    start_date = response.xpath(
        '//dt[contains(text(),"Academic year:")]/following-sibling::dd/text()').extract()
    if not start_date:
        start_date = response.xpath(
            '//th[contains(text(),"Course start")]/following-sibling::td/text()').extract()
    item['start_date'] = ','.join(tracslateDate(start_date)).strip()

    department = response.xpath(
        '//th[contains(text(),"School")]/following-sibling::td/text()').extract()
    item['department'] = ''.join(department).strip()

    fee = response.xpath('//*[contains(text(),"£")]//text()').extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'

    overview = response.xpath(
        '//div[@id="key-features"]|'
        '//section[@class="course-details_section summary-section"]//div[@class="medium-8 medium-pull-4 large-pull-3 column"]'
    ).extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath(
        '//div[@id="course-content"]|//section[@id="contents"]|//div[@id="course-summary"]'
    ).extract()
    item['modules_en'] = remove_class(modules)

    entry = response.xpath(
        '//div[@id="course-entry-requirements"]|//section[@id="entry"]').extract()
    item['rntry_requirements'] = remove_class(entry)

    career = response.xpath(
        '//div[@id="graduate-destinations"]|//section[@id="careers"]').extract()
    item['career_en'] = remove_class(career)

    ielts_text = ''.join(
        response.xpath('//*[contains(text(),"IELTS")]//text()').extract()).strip()
    item['ielts_desc'] = ielts_text
    ielts = get_ielts(ielts_text)
    # The original guard (`!= [] or != {}`) was always true; only copy the
    # scores when something was actually parsed.
    if ielts:
        try:
            for field, key in (('ielts_l', 'IELTS_L'), ('ielts_s', 'IELTS_S'),
                               ('ielts_r', 'IELTS_R'), ('ielts_w', 'IELTS_W'),
                               ('ielts', 'IELTS')):
                item[field] = ielts[key]
        except (KeyError, TypeError):
            # Best effort: a partially parsed result leaves the rest unset.
            pass

    assessment = response.xpath(
        '//a[contains(text(),"ssessment")]/../following-sibling::div[1]').extract()
    item['assessment_en'] = remove_class(assessment)

    yield item
def parse_main(self, response):
    """Scrape an Anglia Ruskin University course page into a single item.

    When the page carries a course code, the entry requirements are fetched
    from the university's course-widget API with a blocking ``requests.get``
    call; otherwise they are read from the page.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...)`` are used.

    Yields:
        The object returned by ``get_item1(ScrapyschoolEnglandItem1)`` after
        the scraped and hard-coded fields have been assigned to it.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Anglia Ruskin University'
    item['url'] = response.url
    item['teach_time'] = '1'

    # The heading is expected as four CRLF-separated parts, with the course
    # name second and the award third; anything else is used as a whole.
    heading_parts = ''.join(response.xpath('//h1/text()').extract()).split('\r\n')
    if len(heading_parts) == 4:
        prog = heading_parts[1].strip()
        item['degree_name'] = heading_parts[2].strip()
    else:
        prog = ''.join(heading_parts)
    item['programme_en'] = prog

    location = response.xpath(
        '//div[@class="course-summary__text"]/p[@class="course-summary__locations"]/a/text()').extract()
    # De-duplicate in first-seen order; set() made the joined order arbitrary.
    item['location'] = ','.join(dict.fromkeys(location))

    start_date = response.xpath(
        '//div[@class="course-summary__text"]/p[@class="course-summary__entry"]/text()').extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    duration = response.xpath('//div[@class="course-summary__teaching"]/p[1]/text()').extract()
    try:
        duration = clear_duration(duration)
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']
    except Exception:
        # Best effort: the duration stays unset when it cannot be parsed.
        pass

    item['overview_en'] = remove_class(response.xpath('//div[@id="overview"]').extract())
    item['career_en'] = remove_class(response.xpath('//div[@id="careers"]').extract())

    modules = remove_class(response.xpath('//div[@id="modulesassessment"]').extract())
    # NOTE(review): remove_class() is applied twice, as in the original;
    # confirm it is idempotent before collapsing this into one call.
    item['modules_en'] = remove_class(modules)

    item['ielts'] = '6.5'
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'
    item['ielts_desc'] = 'Our standard entry criteria for postgraduate courses is IELTS 6.5 or equivalent, with nothing lower than 5.5 in any of the four elements (listening, speaking, reading and writing).'
    item['toefl'] = '88'
    item['toefl_l'] = '17'
    item['toefl_s'] = '20'
    item['toefl_r'] = '18'
    item['toefl_w'] = '17'
    item['toefl_desc'] = "TOEFL iBT with 88 overall and a minimum of 17 in Writing and Listening, 18 in Reading and 20 in Speaking"

    tuition_fee = getTuition_fee(response.xpath('//div[@id="feesfunding"]//text()').extract())
    if tuition_fee == 2018:
        # 2018 is treated as "no fee found" rather than an amount.
        tuition_fee = 0
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = '£'

    # The department name is the last path segment of the "Visit your ..." link.
    department_link = ''.join(
        response.xpath('//a[contains(text(),"Visit your")]/@href').extract())
    item['department'] = department_link.split('/')[-1].title().replace('-', ' ')

    # Steps listed in numerical order (Step 5 used to precede Step 4).
    how_to_apply = [
        "<p>Step 1 - Choose your course</p>",
        "<p>Step 2 - Submit your application form</p>",
        "<p>Step 3 - Check your email regularly</p>",
        "<p>Step 4 - Receive our decision on your application</p>",
        "<p>Step 5 - Start your visa application</p>",
    ]
    item['apply_proces_en'] = '\n'.join(how_to_apply)

    apply_documents = [
        "<ul><li>Qualification certificates and transcripts, including certified translations, where applicable</li>",
        "<li>A personal statement. You can download and complete our Personal Statement Form.</li>",
        "<li>References/recommendation letters</li>",
        "<li>Curriculum vitae/resume</li>",
        "<li>Passport</li>",
        "<li>Current and previous visa(s) (if applicable)</li>",
        "<li>Proof of name change (if applicable)</li>",
        "<li>Portfolio (if applicable)</li></ul>",
    ]
    item['apply_documents_en'] = '\n'.join(apply_documents)

    courseid = response.xpath('//input[@id="erastracode"]/@value').extract()
    if not courseid or courseid[0] == '':
        # No course code (input missing or empty): read the requirements
        # from the page.  A missing input used to raise IndexError below.
        entry = response.xpath('//h4[contains(text(),"ain")]/following-sibling::*').extract()
        item['rntry_requirements'] = remove_class(entry)
    else:
        code = '%20'.join(re.findall('[A-Z0-9]+', courseid[0]))
        rntry_url = 'https://www.anglia.ac.uk/api/coursewidget/multipleentryrequirements?academicYears=2017%2C2018&moaCode=FT&astraCode=' + code
        try:
            # A timeout keeps a stalled server from blocking the crawl forever.
            payload = json.loads(requests.get(rntry_url, timeout=30).text)
            rntry_content = '<div>' + payload[0]['GroupItems'][0]['Text'][0] + '</div>'
        except (requests.RequestException, ValueError, LookupError, TypeError):
            # Network failure, invalid JSON or an unexpected payload shape.
            rntry_content = ''
        item['rntry_requirements'] = rntry_content

    # The yield was commented out, so the finished item was never emitted.
    yield item
def pro_parse(self, response):
    """Scrape one London South Bank University course page into an item.

    The programme name and degree come from the last breadcrumb; fees,
    overview, modules, careers, entry requirements (and the IELTS scores
    derived from them), application notes, duration, study mode, start
    dates and department are read from the page body.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item created by ``get_item1``.
    """

    def cleaned(xpath):
        # Extract the matching nodes and pass them through remove_class().
        return remove_class(response.xpath(xpath).extract())

    item = get_item1(ScrapyschoolEnglandItem1)
    print(response.url)
    item['url'] = response.url
    item['university'] = 'London South Bank University'
    item['location'] = 'London'
    item['tuition_fee_pre'] = '£'

    # The last breadcrumb is the page title; it is split into
    # "<programme> - <degree>" only when it contains exactly one hyphen.
    crumbs = response.xpath('//div[@id="breadcrumbs"]//span/text()').extract()
    title = crumbs[-1] if crumbs else ''
    parts = title.split('-')
    if len(parts) == 2:
        programme = parts[0].strip()
        degree_name = parts[1].strip()
        item['degree_name'] = degree_name
        # startswith() is safe on an empty degree, unlike indexing [0].
        if degree_name.startswith('M'):
            item['degree_type'] = '2'
        elif degree_name.startswith('P'):
            item['degree_type'] = '3'
    else:
        # The title cannot be split reliably, so keep it whole. It must be
        # stored as a string, not as the list produced by split().
        programme = title.strip()
    item['programme_en'] = programme

    fee = response.xpath(
        '//div[@id="tab_fees_and_funding"]//*[contains(text(),"£")]//text()'
    ).extract()
    item['tuition_fee'] = getTuition_fee(fee)

    item['overview_en'] = cleaned('//div[@id="tab_overview"]')
    item['modules_en'] = cleaned('//div[@id="tab_modules"]')
    item['career_en'] = cleaned('//div[@id="tab_employability"]')

    rntry = cleaned('//div[@id="tab_entry_requirements"]')
    item['rntry_requirements'] = rntry

    # IELTS scores are parsed out of the entry-requirements text; an empty
    # list/dict result is treated as "nothing found".
    ielts = get_ielts(rntry)
    if ielts != [] and ielts != {}:
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']
        item['ielts'] = ielts['IELTS']

    item['apply_desc_en'] = cleaned('//div[@id="tab_how_to_apply"]')

    duration = clear_duration(
        response.xpath(
            '//td/span[contains(text(),"Duration")]/following-sibling::div/text()'
        ).extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    mode_text = ''.join(
        response.xpath(
            '//td/span[contains(text(),"Mode")]/following-sibling::div/text()'
        ).extract())
    # '1' when the mode text mentions "full" (any case), otherwise '2'.
    item['teach_time'] = '1' if re.search('(?i)full', mode_text) else '2'

    start_raw = response.xpath(
        '//td/span[contains(text(),"Start")]/following-sibling::div/text()'
    ).extract()
    try:
        # sorted() keeps the joined value stable between runs; iterating a
        # bare set of strings does not guarantee any order.
        months = sorted(set(tracslateDate(start_raw)))
        # NOTE(review): the intake year is hard-coded to 2019.
        item['start_date'] = ','.join('2019-' + month for month in months)
    except Exception as exc:
        # Best effort: leave start_date unset but report why, instead of
        # silently swallowing every error.
        print('start_date not parsed for', response.url, repr(exc))

    item['department'] = ''.join(
        response.xpath('//a[contains(text(),"School of")]/text()').extract())
    yield item
def parse_main(self, response):
    """Scrape one Oxford Brookes University course page into an item.

    Collects programme, degree, department, start dates, duration and study
    mode, overview, modules, tuition fee, entry requirements, careers and
    IELTS scores, then yields the item.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item created by ``get_item1``.
    """

    def joined_text(xpath):
        # Concatenate all matching text nodes and trim the result.
        return ''.join(response.xpath(xpath).extract()).strip()

    def cleaned(xpath):
        # Extract the matching nodes and pass them through remove_class().
        return remove_class(response.xpath(xpath).extract())

    print('进入一个详情页')
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Oxford Brookes University'
    item['url'] = response.url
    # Oxford Brookes is based in Oxford; the previous 'London' value was a
    # copy-paste slip.
    item['location'] = 'Oxford'

    item['programme_en'] = joined_text('//h1/text()')
    item['degree_name'] = joined_text('//h1/following-sibling::h2/text()')
    item['department'] = joined_text(
        '//h1/following-sibling::h2/following-sibling::p/a/text()')

    start_date = response.xpath(
        '//h3[contains(text(),"Available")]/following-sibling::p[1]/text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    duration = response.xpath(
        '//h3[contains(text(),"Course length")]/following-sibling::ul//text()'
    ).extract()
    # NOTE(review): other callbacks in this file store '1'/'2' in
    # teach_time; confirm which encoding the pipeline expects.
    if re.search('(?i)full', ''.join(duration)):
        item['teach_time'] = 'fulltime'
    else:
        item['teach_time'] = 'parttime'
    try:
        parsed = clear_duration(duration)
        item['duration'] = parsed['duration']
        item['duration_per'] = parsed['duration_per']
    except Exception as exc:
        # Best effort: keep the item even when the duration cannot be
        # parsed, but report it instead of hiding every error.
        print('duration not parsed for', response.url, repr(exc))

    item['overview_en'] = cleaned(
        '//h1/following-sibling::h2/following-sibling::p/following-sibling::*')
    item['modules_en'] = cleaned('//div[@id="section-two"]')

    fee = response.xpath('//p[contains(text(),"£")]/text()').extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'

    item['rntry_requirements'] = cleaned('//div[@id="section-four"]')
    item['career_en'] = cleaned('//div[@id="section-five"]')

    ielts_text = ''.join(
        response.xpath('//*[contains(text(),"IELTS")]/text()').extract())
    # Convert every "d.d" score to float so that all branches store the same
    # type (previously some branches stored strings and others floats).
    scores = [float(score) for score in re.findall(r'\d\.\d', ielts_text)]
    if len(scores) == 3:
        # Positional mapping: first score is the overall band, the second is
        # used for reading/writing, the third for listening/speaking.
        # NOTE(review): this ordering is an assumption about the page text.
        overall, read_write, listen_speak = scores
        item['ielts'] = overall
        item['ielts_l'] = listen_speak
        item['ielts_s'] = listen_speak
        item['ielts_r'] = read_write
        item['ielts_w'] = read_write
    elif scores:
        # Any other count: highest score is the overall band, lowest is the
        # minimum for every component (a single score fills all five).
        lowest = min(scores)
        item['ielts'] = max(scores)
        item['ielts_l'] = lowest
        item['ielts_s'] = lowest
        item['ielts_r'] = lowest
        item['ielts_w'] = lowest
    yield item
def parse_main(self, response):
    """Scrape one City, University of London course page into an item.

    Programme, degree name and department arrive through ``response.meta``;
    fee, overview, modules, entry requirements, IELTS scores, careers,
    duration/study mode, start dates, application notes and assessment are
    read from the page.

    Args:
        response: Page response exposing ``url``, ``meta`` and ``xpath()``.

    Yields:
        The populated item created by ``get_item1``.
    """

    def section(xpath):
        # Extract nodes, then run remove_class() and clear_same_s() on them.
        return clear_same_s(remove_class(response.xpath(xpath).extract()))

    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = "City, University of London"
    item['url'] = response.url
    item['location'] = 'London'
    item['programme_en'] = response.meta['programme']
    item['degree_name'] = response.meta['degree_name']
    item['tuition_fee_pre'] = '£'
    item['teach_type'] = 'taught'

    # De-duplicate while keeping first-seen order; a plain set() would make
    # the joined value differ from run to run.
    # NOTE(review): presumably meta['department'] is a list of names;
    # verify against the callback that issues the request.
    item['department'] = ' '.join(dict.fromkeys(response.meta['department']))

    fee = response.xpath(
        '//h3[contains(text(),"Fee")]/../../following-sibling::div//text()'
    ).extract()
    tuition_fee = getTuition_fee(fee)
    if tuition_fee == 0:
        # Fall back to any <span> mentioning a pound amount.
        fee = response.xpath(
            '//span[contains(text(),"£")]//text()').extract()
        tuition_fee = getTuition_fee(fee)
    item['tuition_fee'] = tuition_fee

    item['overview_en'] = section(
        '//h2[contains(text(),"Who is it")]/following-sibling::*|'
        '//h2[contains(text(),"Overview")]/following-sibling::*')
    item['modules_en'] = section(
        '//h2[contains(text(),"Structure")]/following-sibling::*|'
        '//h2[contains(text(),"Modules")]/following-sibling::*')
    item['rntry_requirements'] = section(
        '//h3[contains(text(),"Entry")]/following-sibling::*|//div[@id="entryreq"]'
    )

    ielts = get_ielts(
        response.xpath('//*[contains(text(),"IELTS")]//text()').extract())
    # An empty list/dict result is treated as "nothing found".
    if ielts != {} and ielts != []:
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']
        item['ielts'] = ielts['IELTS']

    item['career_en'] = section(
        '//h2[contains(text(),"Career")]/following-sibling::*')

    duration = response.xpath(
        '//span[contains(text(),"Duration")]/../following-sibling::div//text()|'
        '//h3[contains(text(),"Duration")]/following-sibling::*//text()'
    ).extract()
    # '1' when the duration text mentions "full" (any case), otherwise '2'.
    item['teach_time'] = '1' if re.search('(?i)full', ''.join(duration)) else '2'
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    start_date = response.xpath(
        '//h3[contains(text(),"Start date")]/following-sibling::p/text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    apply_desc_en = response.xpath(
        '//h3[contains(text(),"How to apply")]/following-sibling::*|//div[@id="howtoapply"]'
    ).extract()
    item['apply_proces_en'] = remove_class(apply_desc_en)

    item['require_chinese_en'] = "<p>Applicants will be considered for most postgraduate courses with a good Chinese bachelor’s degree from a recognised University.Students who don’t meet the requirements for direct entry may have the option to undertake our Graduate Diploma programme at INTO City, which then offers the opportunity for guaranteed entry into City’s Masters programmes.</p>"

    assessment = response.xpath(
        '//h2[contains(text(),"Teaching and learning")]/following-sibling::*|//h3[contains(text(),"ssessment")]/following-sibling::*'
    ).extract()
    item['assessment_en'] = remove_class(assessment)

    # Emit the item; without this the callback built it and discarded it.
    yield item
def parse_main(self, response):
    """Scrape one Middlesex University course page into an item.

    Splits the banner title into programme and degree, then collects start
    dates, duration/study mode, fee, overview, modules, entry requirements,
    IELTS details and careers, and yields the item.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item created by ``get_item1``.
    """

    def cleaned(xpath):
        # Extract the matching nodes and pass them through remove_class().
        return remove_class(response.xpath(xpath).extract())

    item = get_item1(ScrapyschoolEnglandItem1)
    print(response.url)
    item['university'] = 'Middlesex University'
    item['url'] = response.url
    item['location'] = 'London'

    title = ''.join(
        response.xpath(
            '//div[@class="course-page-banner__texts"]/h1/text()').extract())
    # The degree is taken to start at the first run of two or more capitals
    # in the title (for example "... MSc" or "... MA/PGDip").
    degree_name = ''.join(re.findall('[A-Z]{2,}.*', title))
    programme = title
    if degree_name != title:
        programme = title.replace(degree_name, '')
    # Strip so that removing the degree does not leave trailing whitespace.
    item['programme_en'] = programme.strip()
    degree_name = degree_name.strip()
    item['degree_name'] = degree_name
    # startswith() copes with an empty degree without needing try/except.
    if degree_name.startswith('M'):
        item['degree_type'] = '2'
    elif degree_name.startswith('P'):
        item['degree_type'] = '3'

    start_date = response.xpath(
        '//span[contains(text(),"Start")]/../following-sibling::div//text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    duration = response.xpath(
        '//span[contains(text(),"Duration")]/../following-sibling::div//text()'
    ).extract()
    full_time = re.search('(?i)full', ''.join(duration)) is not None
    duration = clear_duration(duration)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']
    item['teach_time'] = '1' if full_time else '2'

    fee = response.xpath(
        '//span[contains(text(),"Fees")]/../following-sibling::div//text()'
    ).extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'

    item['overview_en'] = cleaned(
        '//h2[contains(text(),"Overview")]/following-sibling::*')
    item['modules_en'] = cleaned(
        '//h2[contains(text(),"Course content")]/following-sibling::*')
    item['rntry_requirements'] = cleaned(
        '//h2[contains(text(),"Entry requirements")]/following-sibling::*')

    ielts_text = ''.join(
        response.xpath('//p[contains(text(),"IELTS")]//text()').extract())
    item['ielts_desc'] = ielts_text
    ielts = get_ielts(ielts_text)
    # Both emptiness checks must hold ("and"); the previous "or" was always
    # true and left the try/except as the only guard.
    if ielts != [] and ielts != {}:
        try:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']
        except (KeyError, TypeError):
            # Best effort: a result without the expected keys leaves the
            # remaining IELTS fields unset.
            pass

    item['career_en'] = cleaned(
        '//h2[contains(text(),"Careers")]/following-sibling::*')
    yield item
def parse(self, response):
    """Scrape one University of York course page into an item.

    Course pages use several different layouts, so most fields are read
    with a union of alternative XPath expressions. Collects start dates,
    overview, modules, tuition fee, assessment, entry requirements,
    IELTS/TOEFL scores, careers, department, programme, duration and degree
    name, then yields the item.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item created by ``get_item1``.
    """

    def cleaned(xpath):
        # Extract the matching nodes and pass them through remove_class().
        return remove_class(response.xpath(xpath).extract())

    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'University of York'
    item['url'] = response.url
    item['location'] = 'York'
    item['tuition_fee_pre'] = '£'

    start_date = response.xpath(
        '//h4[contains(text(),"Start date")]/following-sibling::p//text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    item['overview_en'] = cleaned(
        '//div[@class="o-grid__box o-grid__box--half o-grid__box--half@medium"]|'
        '//h2[contains(text(),"verview")]/following-sibling::*|'
        '//h2[contains(text(),"At a glance")]/following-sibling::*|'
        '//h2[contains(text(),"Course summary")]/following-sibling::*|'
        '//h2[contains(text(),"At a Glance")]/following-sibling::*|'
        '//div[@id="mdcolumn"]/h1/following-sibling::*[position()<5]')

    item['modules_en'] = cleaned(
        '//div[@id="content_modules"]|'
        '//h2[contains(text(),"Course structure")]/following-sibling::*|'
        '//th[contains(text(),"Module")]/../../..|'
        '//h2[contains(text(),"ontent")]/following-sibling::*|'
        '//h3[contains(text(),"What does the course cover?")]/following-sibling::p[1]|'
        '//strong[contains(text(),"Course structure")]/../following-sibling::*[position()<=5]|'
        '//h2[contains(text(),"Structure and ethos")]/..|'
        '//h2[contains(text(),"Modules")]/following-sibling::*|'
        '//h2[contains(text(),"Structure and Ethos")]/following-sibling::*|'
        '//h2[contains(text(),"module")]/following-sibling::*')

    fee_text = response.xpath(
        '//div[@id="fees"]/following-sibling::div[1]//*[contains(text(),"£")]//text()'
    ).extract()
    item['tuition_fee'] = getTuition_fee(fee_text)

    item['assessment_en'] = cleaned(
        '//h2[contains(text(),"Teaching and assessment")]/../../following-sibling::div[1]'
        '|//h2[contains(text(),"ssessment")]/following-sibling::*|'
        '//h2[contains(text(),"ssessment")]/following-sibling::*[position()<=5]|'
        '//strong[contains(text(),"Specialist training tailored to your interests and aspirations")]/../following-sibling::*|'
        '//span[contains(text(),"ssessment")]/../following-sibling::*[position()<=3]|'
        '//h3[contains(text(),"ssessment")]/following-sibling::*[position()<=3]|'
        '//strong[contains(text(),"SUMMER TERM")]/../following-sibling::*|'
        '//strong[contains(text(),"ssessment")]/../following-sibling::*[position()<=3]|'
        '//h2[contains(text(),"Teaching")]/following-sibling::*|'
        '//blockquote[@class="rightBox"]/following-sibling::*[1]|'
        '//h2[contains(text(),"Dissertation")]/following-sibling::p[1]|'
        '//p[contains(text(),"This programme aims: ")]/following-sibling::table[1]')

    item['rntry_requirements'] = cleaned(
        '//div[@id="entry"]|'
        '//h2[contains(text(),"requirement")]/following-sibling::*|'
        '//h2[contains(text(),"pplicants")]/following-sibling::*|'
        '//h3[contains(text(),"Entry Requirements")]/following-sibling::*|'
        '//h2[contains(text(),"Entry")]/following-sibling::*[position()>1]|'
        '//h3[contains(text(),"International students")]/following-sibling::*|'
        '//h3[contains(text(),"Entry requirements")]/following-sibling::*[position()<4]|'
        '//h2[contains(text(),"English Language Requirements")]/following-sibling::*[position()<3]')

    ielts = get_ielts(
        response.xpath('//*[contains(text(),"IELTS")]//text()').extract())
    # An empty list/dict result is treated as "nothing found".
    if ielts != {} and ielts != []:
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']
        item['ielts'] = ielts['IELTS']

    toefl_text = ''.join(
        response.xpath('//*[contains(text(),"TOEFL")]//text()').extract()
    ).strip()
    item['toefl_desc'] = toefl_text
    # Only the "two numbers" case is interpreted: the larger is the overall
    # score and the smaller the minimum for every component.
    toefl_scores = [int(n) for n in re.findall(r'\d{2,3}', toefl_text)]
    if len(toefl_scores) == 2:
        lowest = min(toefl_scores)
        item['toefl'] = max(toefl_scores)
        item['toefl_l'] = lowest
        item['toefl_w'] = lowest
        item['toefl_r'] = lowest
        item['toefl_s'] = lowest

    item['career_en'] = cleaned(
        '//div[@class="o-grid__box o-grid__box--half"]|'
        '//h2[contains(text(),"areer")]/following-sibling::*|'
        '//h2[contains(text(),"Employment relevance")]/following-sibling::*|'
        '//p[contains(text(),"employment,")]/following-sibling::ul[1]|'
        '//p[contains(text(),"This programme aims: ")]/following-sibling::ul[1]|'
        '//h3[contains(text(),"areers")]/following-sibling::ul[1]|'
        '//h2[contains(text(),"Employment outcomes")]/following-sibling::*|'
        '//h3[contains(text(),"What can it lead to?")]/following-sibling::p[1]')

    item['department'] = ''.join(
        response.xpath(
            '//h4[contains(text(),"Department")]/following-sibling::p//text()|//div[@id="location"]/h1//text()'
        ).extract())

    heading = ''.join(
        response.xpath(
            '//div[@id="mdcolumn"]/h1/text()|//div[@class="c-figure__content c-figure__content--left c-figure__content--half"]/h1/text()'
        ).extract())
    # Drop a leading "<word> in " prefix (for example "MSc in ") so that
    # only the subject remains.
    prefix = ''.join(re.findall('[A-Za-z]+ in ', heading))
    item['programme_en'] = heading.replace(prefix, '').strip()

    duration = clear_duration(
        response.xpath(
            '//h4[contains(text(),"Length")]/following-sibling::p//text()'
        ).extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    major_type1 = ''.join(
        response.xpath(
            '//div[@class="c-figure__content c-figure__content--left c-figure__content--half"]/h1/text()|//div[@id="content-container"]//h1/text()'
        ).extract())
    item['major_type1'] = major_type1
    # Degree abbreviations are words starting with two capitals (MSc, MA,
    # PGDip ...); several matches are joined with "/".
    item['degree_name'] = '/'.join(
        re.findall('[A-Z]{2}[a-zA-Z]*', major_type1)).strip()

    # Emit the item; without this the callback built it and discarded it.
    yield item
def programme(self, response):
    """Scrape one Birkbeck course page; emit it only if offered full-time.

    The page heading is split into programme name and a parenthesised
    degree. Pages whose duration text never mentions "full" (any case) are
    reported and skipped before any further extraction.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item created by ``get_item1``, for full-time
        programmes only.
    """

    def cleaned(xpath):
        # Extract the matching nodes and pass them through remove_class().
        return remove_class(response.xpath(xpath).extract())

    print(response.url)

    duration_text = ''.join(
        response.xpath(
            '//dt[contains(text(),"uration")]/following-sibling::dd[1]//text()'
        ).extract())
    if not re.search('(?i)full', duration_text):
        # Part-time only: nothing is emitted, so skip the remaining work.
        print('这个专业只有兼职,不要!!!')
        return

    item = get_item1(ScrapyschoolEnglandItem1)

    title = ''.join(response.xpath('//h1/text()').extract())
    degree = ''.join(re.findall(r'\(.*\)', title))
    # Everything from the first colon onwards is a subtitle and is dropped.
    subtitle = ''.join(re.findall(':.*', title))
    item['programme_en'] = title.replace(subtitle, '').replace(degree, '').strip()
    item['degree_name'] = degree.replace('(', '').replace(')', '').strip()
    item['url'] = response.url

    start_date = response.xpath(
        '//dt[contains(text(),"tart date")]/following-sibling::dd[1]//text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))
    item['university'] = 'Birkbeck, University of London'
    item['location'] = ''.join(
        response.xpath(
            '//dt[contains(text(),"ocation")]/following-sibling::dd[1]//text()'
        ).extract())

    # Only the text leading up to "full" is handed to clear_duration(), so
    # the full-time length is parsed rather than any part-time one.
    full_time_part = re.findall(r'[a-zA-Z0-9\s]+full', duration_text)
    duration = clear_duration(full_time_part)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    item['overview_en'] = cleaned(
        '//h2[contains(text(),"Highlights")]/preceding-sibling::div[1]')
    item['modules_en'] = cleaned(
        '//h2[contains(text(),"Course structure")]/following-sibling::section')
    item['rntry_requirements'] = cleaned(
        '//h2[contains(text(),"ntry requirements")]/following-sibling::*')

    chinese = [
        '<h3 class="content-show">Postgraduate entry requirements</h3>',
        "<ul><li>Please <a>check your postgraduate course online</a> to see if your programme of study has an entry requirement of a UK undergraduate degree with a 2:1 or a 2:2 classification. </li><li>To study a Master's degree that requires a UK undergraduate degree with a <strong>2:2 classification</strong>, you will typically need to have one of the following:</li><ul><li>a Bachelor's degree (<i>Xueshi</i><span>) from a 211, 985 or top national university with an overall average grade of 70% </span></li><li>a Bachelor's degree from a national university with an overall average grade of 75% </li><li>a Bachelor's degree from a high-ranking private university with an overall average grade of 75% </li><li>a Master's degree with an overall average grade of 60%. </li></ul><li>To study a Master's degree that requires a UK undergraduate degree with a <strong>2:1 classification</strong>, you will typically need to have one of the following: </li><ul><li>a Bachelor's degree (<i>Xueshi</i><span>) from a 211, 985 or top national university with an overall average grade of 75% </span></li><li>a Bachelor's degree from a national university with an overall average grade of 80% </li><li>a Bachelor's degree from a high-ranking private university with an overall average grade of 80% </li><li>a Master's degree with an overall average grade of 70%. </li></ul><li>If you do not meet these criteria, you can apply for Birkbeck’s <a>International Foundation Programme</a><span>, which acts as a bridge between undergraduate and postgraduate study, preparing students to study a Master’s degree in the UK. There are progression pathways onto various courses at Birkbeck.</span></li><li>Another option is the <a>Master's Foundation programme</a>, at our partner provider OnCampus London, which is available for two- or three-term progression onto a wide range of Master’s Degrees at Birkbeck.</li><li>If your transcript is provided in GPA format and not a percentage value, <a>please contact our International Office</a> to check your equivalency. For most institutions: </li><ul><li>80% is equivalent to 4/5 or 3.3/4 </li><li>75% is equivalent to 3.5/5 or 2.7/4. </li></ul>",
    ]
    item['require_chinese_en'] = remove_class(chinese)

    item['toefl_desc'] = 'overall score of 92, with 22 in Reading, 21 in Listening, 23 in Speaking, 24 in Writing.'
    # Component scores follow the description above (Listening is 21; it
    # was previously stored as 22).
    item['toefl_l'] = '21'
    item['toefl_s'] = '23'
    item['toefl_r'] = '22'
    item['toefl_w'] = '24'

    ielts_nodes = response.xpath(
        '//*[contains(text(),"IELTS")]//text()').extract()
    ielts_scores = re.findall(r'\d\.?\d?', ''.join(ielts_nodes))
    # Only the "two numbers" case is interpreted: the larger is the overall
    # band and the smaller the minimum for every component.
    if len(ielts_scores) == 2:
        ielts_scores = [float(score) for score in ielts_scores]
        lowest = min(ielts_scores)
        item['ielts'] = max(ielts_scores)
        item['ielts_l'] = lowest
        item['ielts_s'] = lowest
        item['ielts_r'] = lowest
        item['ielts_w'] = lowest
    item['ielts_desc'] = '\n'.join(ielts_nodes).strip()

    item['assessment_en'] = cleaned(
        '//h2[contains(text(),"Assessment")]/following-sibling::*')

    department = ''.join(
        response.xpath('//a[contains(text(),"isit the")]/text()').extract())
    item['department'] = department.replace('Visit the', '').strip()

    item['apply_proces_en'] = cleaned(
        '//h2[contains(text(),"How to apply")]/following-sibling::*')

    print('这个专业要')
    yield item
def parse(self, response):
    """Scrape one University for the Creative Arts course page into an item.

    The line under the heading is split on hyphens to obtain the degree
    name; duration, campus, start month, overview, syllabus, careers,
    UK entry requirements and portfolio notes are read from the page.
    IELTS scores, tuition fee and deadline are fixed values.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item created by ``get_item1``.
    """

    def cleaned(xpath):
        # Extract the matching nodes and pass them through remove_class().
        return remove_class(response.xpath(xpath).extract())

    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'University for the Creative Arts'
    item['url'] = response.url
    item['programme_en'] = ''.join(response.xpath('//h1/text()').extract())

    subtitle_parts = ''.join(
        response.xpath('//h1/following-sibling::p[1]/text()').extract()
    ).split('-')
    if len(subtitle_parts) == 3:
        # Strip the surrounding whitespace left by the " - " separators so
        # the stored name is clean and the first-letter test sees the
        # degree itself.
        degree_name = subtitle_parts[0].strip()
        item['degree_name'] = degree_name
        if degree_name.startswith('M'):
            item['degree_type'] = '2'
        elif degree_name.startswith('P'):
            item['degree_type'] = '3'
    elif len(subtitle_parts) == 4:
        # NOTE(review): a four-part subtitle is taken to be a pre-degree
        # course; confirm that degree_type '2' is the intended code.
        item['degree_name'] = 'Pre-degree'
        item['degree_type'] = '2'

    duration = clear_duration(
        response.xpath(
            '//p[contains(text(),"Length of study")]/following-sibling::p/text()'
        ).extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    item['location'] = ''.join(
        response.xpath(
            '//p[contains(text(),"Campus")]/following-sibling::p/text()'
        ).extract())

    start_date = response.xpath(
        '//p[contains(text(),"Start month")]/following-sibling::p/text()'
    ).extract()
    item['start_date'] = ','.join(tracslateDate(start_date))

    item['overview_en'] = cleaned('//div[@class="cell overview"]')
    item['modules_en'] = cleaned(
        '//div[@id="syllabus"]/following-sibling::section[@class="article-content-area"][1]'
    )
    item['career_en'] = cleaned(
        '//div[contains(text(),"Career")]/following-sibling::div')

    # Fixed IELTS requirement applied to every course.
    item['ielts'] = '6'
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'

    item['rntry_requirements'] = cleaned(
        '//h3[contains(text(),"UK entry requirements")]/following-sibling::*'
    )
    item['portfolio_desc_en'] = cleaned(
        '//h3[contains(text(),"Your portfolio")]/following-sibling::*')

    item['tuition_fee'] = '13540'
    item['tuition_fee_pre'] = '£'
    item['deadline'] = '2019-3'
    yield item
def parse_main(self, response):
    """Scrape one University of Bedfordshire course page into an item.

    The page heading is split into programme and degree; campus, duration,
    attendance mode, start dates, overview, units, assessment, entry
    requirements and careers are read from the page. Tuition fee,
    IELTS/TOEFL scores and the application instructions are fixed values.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Yields:
        The populated item created by ``get_item1``.
    """

    def texts(xpath):
        # Return the list of strings matched by the expression.
        return response.xpath(xpath).extract()

    def cleaned(xpath):
        # Extract the matching nodes and pass them through remove_class().
        return remove_class(texts(xpath))

    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'University of Bedfordshire'
    item['url'] = response.url

    heading = ''.join(texts('//div[@id="inner-course-content"]/h1/text()'))
    item['tuition_fee_pre'] = '£'
    # MBA programmes carry a different fixed fee from all other courses.
    item['tuition_fee'] = '14000' if 'MBA' in heading else '12750'

    parts = heading.split('-')
    if len(parts) == 2:
        prog = parts[0].strip()
        degr = parts[1].strip()
        item['degree_name'] = degr
        # startswith() copes with an empty degree, so no blanket
        # try/except is needed around the first-letter test.
        if degr.startswith('M'):
            item['degree_type'] = '2'
        elif degr.startswith('P'):
            item['degree_type'] = '3'
    else:
        # NOTE(review): headings with no hyphen or several are re-joined
        # without the hyphens and no degree is recorded.
        prog = ''.join(parts).strip()
    item['programme_en'] = prog

    location = ''.join(
        texts('//strong[contains(text(),"Campus Location")]/../text()'))
    item['location'] = location.replace('-', '').strip()

    duration = clear_duration(
        texts('//strong[contains(text(),"Duration")]/../text()'))
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    attendance = ''.join(
        texts('//strong[contains(text(),"Attendance")]/../text()'))
    # '1' when the attendance text mentions "full" (any case), else '2'.
    item['teach_time'] = '1' if re.search('(?i)full', attendance) else '2'

    start_date = tracslateDate(
        texts('//strong[contains(text(),"Start")]/../text()'))
    item['start_date'] = ','.join(start_date)

    item['overview_en'] = cleaned('//div[@id="why_content"]')
    item['modules_en'] = cleaned('//div[@id="unit_content"]')
    item['assessment_en'] = cleaned('//div[@id="how_content"]')
    item['rntry_requirements'] = cleaned(
        '//h2[@id="entry"]/following-sibling::div/ul[@class="tab-content"]/div[3]')

    # Fixed English-language requirements applied to every course.
    item['ielts'] = '6.0'
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'
    item['toefl_l'] = '17'
    item['toefl_s'] = '20'
    item['toefl_r'] = '18'
    item['toefl_w'] = '17'

    item['career_en'] = cleaned('//div[@id="career_content"]')

    apply_d = [
        '<p>There are two ways you can make a direct application to the University of Bedfordshire:</p><ul><li><a href="https://evision.beds.ac.uk/urd/sits.urd/run/siw_ipp_lgn.login?process=siw_ipp_app&code1=OA_FORM&code2=0007">Apply online now for 2017/18</a> Courses starting from 1 August 2017 to 31 July 2018</li><li>Download <span class="include_asset_summary"><a href="https://www.beds.ac.uk/__data/assets/pdf_file/0006/441798/International-Application-web-2018.pdf">an application form - <img src="https://www.beds.ac.uk/__data/asset_types/pdf_file/icon.png" alt="" title="" height="16" width="16" class="sq-icon" /> PDF 1.0 MB ',
        '</a></span> and submit it to our <a href="https://www.beds.ac.uk/international/international-applications/contactus">Admissions Team</a> along with scans of your supporting documents, via email, post or in person at the International Office.</li></ul><p>You can post your completed form to:</p><p>University of Bedfordshire International Admissions/International Office/University Square/Luton/Bedfordshire/LU1 3JU/United Kingdom</p><h4>Please note</h4><ul><li><strong>BSc (Hons) Nursing Studies</strong> Level 3 and <strong>MSc Advanced Nursing Studies</strong> are available to overseas students - please contact <a href="https://www.beds.ac.uk/international/international-applications/contactus">International Admissions</a></li><li><strong>Healthcare, Nursing and Midwifery students</strong> - many of these courses are not available to overseas students due to UK immigration law in regard to bursary funding. Please contact <a href="https://www.beds.ac.uk/international/international-applications/contactus">International Admissions</a> to find out if you are eligible to apply.</li></ul><p>*Please note that international students studying on a Tier 4 Student Visa must choose a full-time Undergraduate or Postgraduate course and are not eligible for part-time study.</p><p>Watch some more tips and advice on making your application to Bedfordshire:</p>',
    ]
    item['apply_documents_en'] = '\n'.join(apply_d)
    yield item
def parses(self, response):
    """Parse a Buckinghamshire New University course page into one item.

    Scrapes location, programme title, degree name, duration, start date,
    overview, modules, career and entry-requirement markup from ``response``,
    then fills in fixed values for the tuition fee, the IELTS scores, the
    requirements for Chinese applicants and the how-to-apply text.

    Args:
        response: page response; only ``response.url`` and
            ``response.xpath()`` are used.

    Yields:
        The populated item created by ``get_item1(ScrapyschoolEnglandItem1)``.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    item['url'] = response.url
    item['university'] = 'Buckinghamshire New University'

    location = response.xpath(
        '//ul[@class="course-details"]/li[contains(text(),"Location")]/text()'
    ).extract()
    location = ''.join(location).replace('Location:', '').strip()

    programme = response.xpath(
        '//h1[@class="banner-title"]/text()').extract()
    item['programme_en'] = ''.join(programme).strip()

    degree_name = response.xpath(
        '//p[@class="school-code"]/text()').extract()
    item['degree_name'] = ''.join(degree_name).strip()
    item['location'] = location

    duration = clear_duration(response.xpath(
        '//ul[@class="course-details"]/li[contains(text(),"Duration")]/text()'
    ).extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    start_date = tracslateDate(response.xpath(
        '//ul[@class="course-details"]/li[contains(text(),"Start Date")]/text()'
    ).extract())
    # The translated dates used to be computed and then thrown away; store
    # them comma-joined, the same way the sibling parsers in this file do.
    item['start_date'] = ','.join(start_date)

    overview = response.xpath(
        '//h2[contains(text(),"Course Overview")]/..').extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath(
        '//h2[contains(text(),"Course Modules")]/..').extract()
    item['modules_en'] = remove_class(modules)

    career = response.xpath(
        '//h2[contains(text(),"Employability")]/..').extract()
    item['career_en'] = remove_class(career)

    # Only the first three paragraphs after the heading are taken.
    entry = response.xpath(
        '//h3[contains(text(),"What are the course entry requirements?")]/following-sibling::p[position()<=3]'
    ).extract()
    if not entry:
        # Surface pages where the entry-requirements section was not found.
        print(response.url)
    item['rntry_requirements'] = remove_class(entry)

    # The fee is a fixed value, not scraped from the page.
    item['tuition_fee'] = '11500'
    # Sibling parsers always pair the fee with its currency prefix.
    item['tuition_fee_pre'] = '£'

    # Static requirement text for Chinese applicants.
    chi = [
        ' <div> ',
        ' <p>Academic entry requirements</p ><p>We require successful completion of a 学士学位 (Bachelor degree) or successful completion of a three-year 本科毕业证书 (Benke) with an overall pass from a UK NARIC-recognised or Ministry of Education-listed institution.</p ><p>Mathematics entry requirements</p ><p>Students need the equivalent of GCSE Mathematics grade C/4.</p > ',
        ' </div> ',
    ]
    # Static how-to-apply text.
    htp = [
        '<p>There’s still time to apply for September 2018. Visit our <a hre>clearing section</a> to find out more.</p><p><strong>Check you meet the entry requirements</strong></p><p>Once you’ve had a good look at our course information, and chosen which one feels right for you, before applying it’s worth checking that you meet the entry requirements for your country.</p><p>We welcome applications from students with a wide range of qualifications from around the world. You’ll find details of the exact academic and English language requirements for your country on our <a hre>country pages</a>.</p><p>Every student studying with us also needs to meet our <a hre>English language requirements</a> and we will ask you to provide evidence to show you have good enough English to study a higher education course in the UK.</p><p><strong>Different ways to apply</strong></p><p>When you are ready to apply for your course, you can do so in one of three ways:</p><ul><li>directly through our <a href="https://www.applycpd.com/bucks?tabid=21">application portal</a></li><li>through <a hre>UCAS</a>, or</li><li>through a recruitment agent in your country (see <a hre>your country page</a> for details of agents we work with who are operating locally to you).</li></ul><p>It doesn’t matter which of these routes you use, but we advise you to apply early to give yourself enough time to prepare for moving to the UK and arranging your visa, if you need one.</p><p>If you’ve missed out on your first choices, declined any offers made to you, or you’re applying to university after 30 June, you can also apply to us through <a hre>Clearing</a>.</p>',
    ]
    item['require_chinese_en'] = remove_class(chi)
    item['apply_desc_en'] = remove_class(htp)

    # IELTS scores are fixed values, not scraped from the page.
    item['ielts'] = '6.0'
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'
    yield item
def parses(self, response):
    """Parse a Manchester Metropolitan University course page into one item.

    A 404 response is recorded in ``errorurl.txt`` and produces no item.
    Otherwise the degree name, programme title, overview, career, entry
    requirements, modules, tuition fee, duration, start date and department
    are scraped from ``response``, and the IELTS / TOEFL fields are filled
    with fixed values.

    Args:
        response: page response; ``response.url``, ``response.status`` and
            ``response.xpath()`` are used.

    Yields:
        The populated item created by ``get_item1(ScrapyschoolEnglandItem1)``
        (nothing for a 404 response).
    """
    print('开始下载', response.url, '的数据')

    # Handle a 404 before any extraction, so that a helper failing on an
    # empty error page cannot prevent the URL from being recorded.
    if response.status == 404:
        print("****404****")
        with open("errorurl.txt", 'a+') as f:
            f.write(response.url + "\n")
        return

    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Manchester Metropolitan University'
    item['url'] = response.url
    item['location'] = 'Manchester'

    # The degree name sits in a <span> inside the <h1>; the programme title
    # is the <h1>'s own text.
    item['degree_name'] = ''.join(
        response.xpath('//h1/span/text()').extract())
    item['programme_en'] = ''.join(
        response.xpath('//h1/text()').extract()).strip()
    item['degree_type'] = 2

    overview = response.xpath(
        '//h2[contains(text(),"Overview")]/following-sibling::article'
    ).extract()
    item['overview_en'] = remove_class(overview)

    career = response.xpath(
        '//h2[contains(text(),"Career")]/following-sibling::p').extract()
    item['career_en'] = remove_class(career)

    entry = response.xpath(
        '//h2[contains(text(),"Entry")]/following-sibling::p').extract()
    item['rntry_requirements'] = remove_class(entry)

    modules = response.xpath(
        '//h2[contains(text(),"Course")]/following-sibling::div').extract()
    item['modules_en'] = remove_class(modules)

    # Every text node under an element mentioning "£" is handed to the helper.
    fee_texts = response.xpath('//*[contains(text(),"£")]//text()').extract()
    item['tuition_fee'] = getTuition_fee(fee_texts)
    item['tuition_fee_pre'] = '£'

    # Language scores are fixed values, not scraped from the page.
    item['ielts_l'] = '5.5'
    item['ielts_s'] = '5.5'
    item['ielts_r'] = '5.5'
    item['ielts_w'] = '5.5'
    item['ielts'] = '6.5'
    item['ielts_desc'] = 'For Postgraduate courses, we usually ask for IELTS 6.5 (No less than 5.5 in any section) or equivalent.'
    item['toefl_desc'] = 'Overall score: 89 With no individual test score below: Listening: 17 Reading: 18 Speaking: 20 Writing : 17'
    item['toefl'] = '89'
    item['toefl_l'] = '17'
    item['toefl_s'] = '20'
    item['toefl_r'] = '18'
    item['toefl_w'] = '17'

    length_texts = response.xpath(
        '//li[contains(text(),"Length")]/span//text()').extract()
    duration = clear_duration(length_texts)
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    start_texts = response.xpath(
        '//li[contains(text(),"Start")]/span//text()').extract()
    item['start_date'] = ','.join(tracslateDate(start_texts))

    item['department'] = ''.join(
        response.xpath(
            '//span[@id="department_name"]/text()').extract()).strip()

    yield item
def parse(self, response): item = get_item1(ScrapyschoolEnglandItem1) print(response.url) item['location'] = 'Newcastle' item['university'] = 'Northumbria University' item['url'] = response.url programme = response.xpath( '//div[@class="col-sm-6"]/h1/text()|//div[@class="hero-content"]/h1/text()|//header[@class="course-heading"]/h1/text()' ).extract() programme = ''.join(programme).strip() degree_name = re.findall('[A-Z]{2,}.*', programme) degree_name = ''.join(degree_name) if degree_name != programme: programme = programme.replace(degree_name, '') item['programme_en'] = programme item['degree_name'] = degree_name try: if degree_name[0] == 'M': item['degree_type'] = '2' elif degree_name[0] == 'P': item['degree_type'] = '3' except: pass dur = response.xpath( '//strong[contains(text(),"Mode")]/../text()|//span[contains(text(),"uration")]/../text()' ).extract() # print(dur) duration = clear_duration(dur) # print(duration) item['duration'] = duration['duration'] item['duration_per'] = duration['duration_per'] item['teach_time'] = '1' start_date = response.xpath( '//strong[contains(text(),"Start")]/../text()|//span[contains(text(),"Start")]/../text()' ).extract() start_date = list(set(start_date)) # print(start_date) start_date = tracslateDate(start_date) # print(start_date) start_date = ','.join(start_date) item['start_date'] = start_date deadline = response.xpath( '//span[contains(text(),"deadline")]/../text()').extract() deadline = list(set(deadline)) # print(deadline) deadline = tracslateDate(deadline) # print(deadline) deadline = ''.join(deadline) item['deadline'] = deadline ielts = response.xpath( '//*[contains(text(),"IELTS")]/text()').extract() item['ielts_desc'] = ''.join(ielts).strip() ielts = get_ielts(ielts) try: if ielts != [] or ielts != {}: item['ielts_l'] = ielts['IELTS_L'] item['ielts_s'] = ielts['IELTS_S'] item['ielts_r'] = ielts['IELTS_R'] item['ielts_w'] = ielts['IELTS_W'] item['ielts'] = ielts['IELTS'] except: pass if ielts == []: ielts = 
response.xpath( '//*[contains(text(),"English Language requirements")]/../text()' ).extract() ielts = get_ielts(ielts) try: if ielts != [] or ielts != {}: item['ielts_l'] = ielts['IELTS_L'] item['ielts_s'] = ielts['IELTS_S'] item['ielts_r'] = ielts['IELTS_R'] item['ielts_w'] = ielts['IELTS_W'] item['ielts'] = ielts['IELTS'] except: pass # print(ielts) overview = response.xpath( '//div[@id="tab-0"]//div[@class="rich-text"]|//h3[contains(text(),"Overview")]/following-sibling::p' ).extract() overview = remove_class(overview) # print(overview) item['overview_en'] = overview modules = response.xpath( '//div[@id="tab-1"]//div[@class="rich-text"]|//div[@id="modules"]' ).extract() modules = remove_class(modules) # print(modules) item['modules_en'] = modules rntry = response.xpath( '//*[contains(text(),"English Language requirements")]/..' ).extract() rntry = remove_class(rntry) # print(rntry) item['rntry_requirements'] = rntry howtoapply = response.xpath('//div[@id="how-to-apply"]').extract() howtoapply = remove_class(howtoapply) item['apply_proces_en'] = howtoapply department = response.xpath( '//strong[contains(text(),"Department")]/../text()').extract() department = ''.join(department).strip() item['department'] = department fee = response.xpath('//*[contains(text(),"£")]//text()').extract() # print(fee) tuition_fee = getTuition_fee(fee) # print(tuition_fee) item['tuition_fee'] = tuition_fee item['tuition_fee_pre'] = '£' career = response.xpath( '//h1[contains(text(),"career")]/../following-sibling::div|//div[@id="tab-5"]' ).extract() career = remove_class(career) # print(career) item['career_en'] = career