def parse_apply_proces_en(self, how_to_apply_url):
    """Download the "how to apply" page and pull two HTML fragments from it.

    Returns a two-element list ``[apply, apply_documents]``:

    * the cleaned HTML of the first ``field-name-field-gao-course-apply`` div;
    * the cleaned, concatenated HTML of every node selected by the
      "Things" heading XPath.

    Either entry is an empty string when its XPath matches nothing.
    """
    page = requests.get(how_to_apply_url, headers=self.headers)
    tree = etree.HTML(page.text)

    def to_html(node):
        # Serialise an lxml element to an HTML string.
        return etree.tostring(node, encoding='unicode', pretty_print=False, method='html')

    process_nodes = tree.xpath(
        "//div[@class='field field-name-field-gao-course-apply field-type-text-long field-label-hidden']"
    )
    process_html = ""
    if process_nodes:
        # Only the first matching div is used.
        process_html = remove_class(clear_space_str(to_html(process_nodes[0])))

    document_nodes = tree.xpath(
        '//h2[contains(text(),"Things")]/preceding-sibling::*[1]/following-sibling::*'
    )
    documents_html = ""
    if document_nodes:
        merged = "".join(to_html(node) for node in document_nodes)
        documents_html = remove_class(clear_space_str(merged))

    return [process_html, documents_html]
def get_modules2(self, modules2url):
    """Fetch the modules page and return its main container as cleaned HTML.

    The page at ``modules2url`` is downloaded with ``self.headers``; the first
    ``/html/body/div[@class='container']`` element is serialised to HTML and
    passed through ``clear_space_str`` and ``remove_class``.

    Returns an empty string when the page has no such container (previously
    this raised ``IndexError``).
    """
    data = requests.get(modules2url, headers=self.headers)
    response = etree.HTML(data.text)
    modules2 = response.xpath("/html/body/div[@class='container']")
    if not modules2:
        # Nothing matched: report "no content" instead of crashing the crawl.
        return ""
    m2 = etree.tostring(modules2[0], encoding='unicode', pretty_print=False, method='html')
    return remove_class(clear_space_str(m2))
def parse_apply_proces_en(self, how_to_apply_url):
    """Fetch the "how to apply" page and return its intro summary as cleaned HTML.

    The first ``div`` whose class is exactly ``layout-row intro summary`` is
    serialised to HTML and passed through ``clear_space_str`` and
    ``remove_class``.

    Returns an empty string when no such div exists (previously this raised
    ``IndexError``).
    """
    data = requests.get(how_to_apply_url, headers=self.headers)
    response = etree.HTML(data.text)
    apply_proces_en = response.xpath("//div[@class='layout-row intro summary']")
    if not apply_proces_en:
        # Nothing matched: report "no content" instead of crashing the crawl.
        return ""
    # Convert the element to an HTML string.
    apply = etree.tostring(apply_proces_en[0], encoding='unicode', pretty_print=False, method='html')
    return remove_class(clear_space_str(apply))
def parse_assessment_en(self, teaching_assessment_url):
    """Fetch the teaching/assessment page and return the study field as cleaned HTML.

    The first ``field-name-field-gao-course-study`` div is serialised to HTML
    and passed through ``clear_space_str`` and ``remove_class``.

    Returns an empty string when no such div exists (previously this raised
    ``IndexError``).
    """
    data = requests.get(teaching_assessment_url, headers=self.headers)
    response = etree.HTML(data.text)
    assessment_en = response.xpath(
        "//div[@class='field field-name-field-gao-course-study field-type-text-long field-label-hidden']"
    )
    if not assessment_en:
        # Nothing matched: report "no content" instead of crashing the crawl.
        return ""
    ass = etree.tostring(assessment_en[0], encoding='unicode', pretty_print=False, method='html')
    return remove_class(clear_space_str(ass))
def parse(self, response):
    """Scrape a Kingston University course page into ``ScrapyschoolEnglandItem1`` items.

    Fields are read from ``response`` and from four sub-pages fetched with
    ``requests`` (teaching-learning-assessment, entry-requirements,
    fees-and-funding, after-you-graduate), whose URLs are built by appending
    to ``response.url``.

    Yields the item once per UCAS code when the number of codes equals the
    number of "full time" duration cells found; otherwise yields it once with
    ``duration`` set to ``None`` and empty ``other`` / ``ucascode``.
    NOTE(review): the same item object is mutated and re-yielded for each
    code — confirm downstream pipelines consume it before the next yield.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

    def fetch_tree(page_url):
        # Download a course sub-page and parse it with lxml.
        return etree.HTML(requests.get(page_url, headers=headers).text)

    # 1.university
    university = 'Kingston University'
    # 2.url
    url = response.url

    # 3.programme_en / 4.degree_name
    programme_en = ''.join(response.xpath('//*[@id="middle-col"]/h1').extract())
    programme_en = remove_tags(programme_en)
    programme_en = programme_en.replace('& ', '')
    degree_name_a = ''
    degree_name = ''
    if '(' in programme_en:
        # A "(" does not guarantee a "(Hons)" award; only use the match if one exists.
        hons = re.findall(r'[A-Za-z/]+\(Hons\)', programme_en)
        if hons:
            degree_name_a = hons[0]
            degree_name = degree_name_a.replace('(Hons)', '')
    if degree_name_a:
        programme_en = programme_en.replace(degree_name_a, '')
    # NOTE(review): the literal here looks whitespace-collapsed in the source;
    # assumed to squeeze the double space left by removing the award — confirm.
    programme_en = programme_en.replace('  ', ' ')

    # 5.degree_type
    degree_type = 1
    # 6.start_date
    start_date = '2018-9,2019-1,2019-4'

    # 7.overview_en
    overview_en = ''.join(
        response.xpath("//h2[contains(text(),'What you will study')]/preceding-sibling::*").extract())
    overview_en = remove_class(clear_space_str(overview_en))

    # 8.assessment_en
    assessment_tree = fetch_tree(url + 'teaching-learning-assessment.html')
    assessment_en = ''.join(assessment_tree.xpath('//*[@id="middle-col"]/div[2]/p//text()'))

    # 9.modules_en
    modules_en = remove_class(''.join(response.xpath('//*[@id="modulelist"]').extract()))

    # 10.alevel
    alevel_tree = fetch_tree(url + 'entry-requirements.html')
    alevel = ''.join(alevel_tree.xpath("//*[contains(text(),'evel')]/.//text()"))

    # 11-15.ielts: first keyword found in the programme title wins.
    # Score order: overall, reading, writing, listening, speaking.
    ielts_rules = (
        ('Health', (6.5, 5.5, 5.5, 5.5, 5.5)),
        ('Social Care', (6.5, 5.5, 5.5, 5.5, 5.5)),
        ('Education', (6.5, 5.5, 5.5, 5.5, 5.5)),
        ('Journalism', (6.5, 5.5, 6.5, 5.5, 5.5)),
        ('Nursing', (7.0, 7.0, 7.0, 7.0, 7.0)),
        ('Nutrition', (6.5, 6.0, 6.0, 6.0, 6.0)),
    )
    ielts, ielts_r, ielts_w, ielts_l, ielts_s = next(
        (scores for keyword, scores in ielts_rules if keyword in programme_en),
        (6.0, 5.5, 6.5, 5.5, 5.5),
    )

    # 16.tuition_fee
    fee_tree = fetch_tree(url + 'fees-and-funding.html')
    tuition_fee = ''.join(
        fee_tree.xpath('//*[@id="middle-col"]/div[2]/table/tbody/tr[3]/td[2]//text()'))
    tuition_fee = getTuition_fee(tuition_fee)
    # 17.tuition_fee_pre
    tuition_fee_pre = '£'
    # 18.apply_proces_en (stored as the URL of the apply page, not its content)
    apply_proces_en = response.url + 'apply-now.html'

    # 19.career_en
    career_tree = fetch_tree(url + 'after-you-graduate.html')
    career_nodes = career_tree.xpath("//h2[contains(text(),'Careers and progression')]/../..")
    career_en = ""
    if career_nodes:
        career_en = remove_class(''.join(
            etree.tostring(node, encoding='unicode', pretty_print=False, method='html')
            for node in career_nodes))

    # 20.location
    location = 'London'
    # 21.apply_pre
    apply_pre = '£'

    item['alevel'] = alevel
    item['apply_pre'] = apply_pre
    item['university'] = university
    item['url'] = url
    item['programme_en'] = programme_en
    item['degree_name'] = degree_name
    item['degree_type'] = degree_type
    item['start_date'] = start_date
    item['overview_en'] = overview_en
    item['assessment_en'] = assessment_en
    item['modules_en'] = modules_en
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_s'] = ielts_s
    item['ielts_w'] = ielts_w
    item['ielts_l'] = ielts_l
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = tuition_fee_pre
    item['apply_proces_en'] = apply_proces_en
    item['career_en'] = career_en
    item['location'] = location

    # 22.ucascode: four-character codes that open a table cell.
    ucas_cells = ''.join(response.xpath('//*[@id="middle-col"]//table//tr/td').extract())
    ucas = re.findall(r'td>([A-Z][A-Z0-9]{3})', ucas_cells)
    print(ucas, '---')
    item['duration'] = None
    item['other'] = ''
    item['ucascode'] = ''

    response_duration = []
    if ucas:
        # The original rebuilt this query for every code and kept only the
        # last result, so it is evaluated once with the last code.
        # NOTE(review): the code is interpolated without quotes, so XPath reads
        # it as an element name rather than a string literal — confirm whether
        # a quoted, per-code lookup was intended.
        xpaths = '//*[contains(text(),' + str(ucas[-1]) + ')]//preceding-sibling::td[contains(text(),"full time")]'
        response_duration = response.xpath(xpaths).extract()

    if ucas and len(ucas) == len(response_duration):
        # Codes and duration cells are paired by position.
        for j, code in enumerate(ucas):
            duration_major = response_duration[j].replace('<td>', '').replace('</td>', '')
            digits = re.findall(r'\d', duration_major)
            # A cell without any digit used to raise IndexError mid-crawl.
            item['duration'] = digits[0] if digits else None
            item['other'] = duration_major
            item['ucascode'] = code
            print("==========================", str(j))
            print(item['duration'], '---')
            print(item['other'], '---')
            print(item['ucascode'], '---')
            yield item
    else:
        yield item
def parse(self, response):
    """Scrape a London Metropolitan University course page into one item.

    All fields come from ``response`` or from fixed text. ``duration`` and
    ``tuition_fee`` are read from the ``data-duration`` / ``data-cost``
    attributes of the "September 2019 - Full-time" entry, falling back to the
    "September 2018 - Full-time" entry when the 2019 one is missing or empty.
    ``degree_name``, ``location`` and the IELTS fields are not populated here.

    Yields a single ``ScrapyschoolEnglandItem1``.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    intake_xpath = "//*[contains(text(),'September %s - Full-time')]//@%s"
    intake_years = ('2019', '2018')

    # 1.university
    university = 'London Metropolitan University'
    # 2.url
    url = response.url

    # 3.ucascode
    ucascode = ''.join(
        response.xpath("//*[contains(text(),'UCAS code:')]//following-sibling::*").extract())
    ucascode = clear_space_str(remove_tags(ucascode))

    # 4.programme_en
    programme_en = remove_tags(''.join(response.xpath('//*[@id="MainContent"]/div[1]/h1').extract()))

    # 5.degree_type
    degree_type = 1

    # 7.alevel
    alevel = ''.join(response.xpath('//*[@id="entry-requirements"]/div/ul/li[1]').extract())
    alevel = clear_space_str(remove_tags(alevel))

    # 8.overview_en
    overview_en = ''.join(response.xpath('//*[@id="LeftColumn"]/section/p').extract())
    overview_en = clear_space_str(remove_class(overview_en))

    # 9.start_date
    start_date = '2018-8-18'
    # 10.apply_pre
    apply_pre = '£'

    # 11.duration: first attribute value of the newest intake that has one.
    # The old code indexed extract()[0] inside a bare try/except, so a missing
    # 2019 entry skipped the 2018 fallback entirely.
    duration = ''
    for year in intake_years:
        values = response.xpath(intake_xpath % (year, 'data-duration')).extract()
        if values and values[0]:
            duration = values[0]
            break

    # 12.tuition_fee: same 2019 -> 2018 fallback, all attribute values joined.
    tuition_fee = ''
    for year in intake_years:
        tuition_fee = ''.join(response.xpath(intake_xpath % (year, 'data-cost')).extract())
        if tuition_fee:
            break
    tuition_fee = getTuition_fee(tuition_fee)

    # 14.apply_documents_en
    apply_documents_en = '<p>We welcome applications from all suitably qualified prospective students and want to recruit students with the very best academic merit, potential and motivation, irrespective of their background. We carefully consider each application on an individual basis, taking into account all the information presented on your application form, including your: academic achievement (including predicted and achieved grades) personal statement two references CV</p>'

    # 15.modules_en
    modules_en = ''.join(response.xpath('//*[@id="modular-structure"]').extract())
    modules_en = remove_class(modules_en).replace('▼', '')

    # 16.assessment_en
    assessment_en = ''.join(
        response.xpath("//h3[contains(text(),'Assessment')]//following-sibling::p[1]").extract())
    assessment_en = remove_class(assessment_en)

    # 17.career_en
    career_en = remove_class(''.join(response.xpath('//*[@id="career-opportunities"]/div').extract()))

    # 23.require_chinese_en
    require_chinese_en = "https://www.londonmet.ac.uk/international/international-admissions/application-guidance-and-entry-criteria/academic-entry-requirements-by-country/non-eueea-countries/china/"

    # 24.apply_proces_en
    apply_proces_en = '<p>Stage 1: choosing your course The first step for you as a new applicant is to choose the course you wish to undertake. If you have any questions at this stage you can contact our international recruitment team who will be happy to assist you and provide information about our courses. You can begin a conversation about a course you are interested in by emailing our recruitment team at: [email protected]. We often have representatives of London Metropolitan University visiting countries all around the world. You can find out the latest planned trips to see if we will be visiting near you, here: Meet us overseas Stage 2: applying for your course Once you have decided on your course you need to submit an application as soon as possible making sure you observe the international application deadlines. The method of application depends on the type of course you are applying for. The application methods available for each course are listed on the course page. You should check these details carefully to avoid any delay in your application reaching us.You should observe our international application guidance before submitting an application. Please see here: International application advice Stage 3: awaiting and responding to your offer Once the University receives your application you will receive a communication from us acknowledging this. You will also obtain your London Metropolitan University application ID and details about using, the applicant portal (Evision). At this point your application will enter the pending decision/consideration stage, and we will communicate with you again, either to request more information (such as a qualification transcript, portfolio, or piece of written work) for assessment, or to advise you of our decision.If you are successful in receiving an offer from us you will receive a communication detailing a conditional or unconditional offer, and this will contain further information and instruction. If your application is unsuccessful we will also contact you advising you of this, and our reasons for the decision. You can find out more about offers here: Information and advice for offer holders.Stage 4: Immigration and enrolment After obtaining an unconditional offer you will need to focus on making preparations to join the university and your arrangements to come to the UK (if you are not already here). You will receive further information about when and where to arrive, and how to attend your course enrolment closer to the enrolment period of your course.You should be considering your accommodation and finances as soon as possible before the start of term, and you should also be aware of, and be prepared to meet, any immigration requirements such as obtaining a student visa at the earliest opportunity. You can find a variety of information about moving to London here: Immigration and Arrival Advice: New Students.</p>'

    # 26.tuition_fee_pre
    tuition_fee_pre = '£'

    item['apply_pre'] = apply_pre
    item['apply_documents_en'] = apply_documents_en
    item['university'] = university
    item['url'] = url
    item['programme_en'] = programme_en
    item['degree_type'] = degree_type
    item['overview_en'] = overview_en
    item['start_date'] = start_date
    item['duration'] = duration
    item['tuition_fee'] = tuition_fee
    item['modules_en'] = modules_en
    item['assessment_en'] = assessment_en
    item['career_en'] = career_en
    item['require_chinese_en'] = require_chinese_en
    item['apply_proces_en'] = apply_proces_en
    item['tuition_fee_pre'] = tuition_fee_pre
    item['alevel'] = alevel
    item['ucascode'] = ucascode
    yield item
def parse(self, response):
    """Scrape a University of Hull course page into one item.

    All fields come from ``response`` or from fixed values. IELTS scores are
    taken from the text following the "International students" heading when
    exactly two ``d.d`` numbers are found there (first = overall, second =
    every component); otherwise defaults of 6.0 overall / 5.5 per component
    are used. ``duration`` is not populated here.

    Yields a single ``ScrapyschoolEnglandItem1``.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    # 1.university
    university = 'University of Hull'
    # 2.url
    url = response.url

    # 3.programme_en
    programme_en = ''.join(response.xpath(
        '//*[@id="main-content"]/header/div[2]/div[1]/h1|//*[@id="main-content"]/section[1]/div[2]/div/div/h1'
    ).extract())
    programme_en = remove_tags(programme_en)

    # 4.degree_type
    degree_type = 1

    # 5.degree_name
    degree_name = ''.join(
        response.xpath('//*[@id="main-content"]/header/div[2]/div[1]/p[2]/span[2]').extract())
    degree_name = remove_tags(degree_name)

    # 6.start_date
    start_date = '2019-9'

    # 7.ucascode
    ucascode = ''.join(
        response.xpath('//*[@id="main-content"]/header/div[2]/div[2]/div/div[3]/span').extract())
    ucascode = clear_space_str(remove_tags(ucascode).strip())

    # 8.apply_desc_en
    apply_desc_en = remove_class(''.join(response.xpath('//*[@id="entry"]/div/div[1]').extract()))

    # 9.overview_en
    overview_en = remove_class(''.join(response.xpath('//*[@id="about"]/div/div[1]/p').extract()))

    # 10.tuition_fee
    tuition_fee = ''.join(
        response.xpath("//*[contains(text(),'Fees and funding')]//following-sibling::*").extract())
    tuition_fee = getTuition_fee(tuition_fee)

    # 11.tuition_fee_pre
    tuition_fee_pre = '£'

    # 12.modules_en
    modules_en = remove_class(''.join(response.xpath('//*[@id="study"]/div/div[1]').extract()))

    # 13.ib
    ib = ''.join(response.xpath(
        "//*[contains(text(),'Alternative qualifications')]/../following-sibling::*//li[1]").extract())
    ib = remove_tags(ib)

    # 14.career_en
    career_en = ''.join(
        response.xpath("//*[contains(text(),'Future prospects')]//following-sibling::*").extract())
    career_en = remove_class(career_en)

    # 15.require_chinese_en
    require_chinese_en = 'https://www.hull.ac.uk/choose-hull/study-at-hull/international/country/china.aspx'

    # 16-20.ielts
    ielts_list = ''.join(
        response.xpath("//*[contains(text(),'International students')]//following-sibling::*").extract())
    # re.findall on a str cannot fail, so the old bare try/except (whose None
    # fallback would have crashed in len()) is gone.
    ielts_scores = re.findall(r'\d\.\d', ielts_list)
    if len(ielts_scores) == 2:
        # NOTE(review): matched scores stay strings while the defaults below
        # are floats — confirm downstream code accepts both.
        ielts = ielts_scores[0]
        ielts_l = ielts_r = ielts_s = ielts_w = ielts_scores[1]
    else:
        ielts = 6.0
        ielts_l = ielts_r = ielts_s = ielts_w = 5.5

    # 23.apply_pre
    apply_pre = '£'

    item['apply_pre'] = apply_pre
    item['university'] = university
    item['url'] = url
    item['programme_en'] = programme_en
    item['degree_type'] = degree_type
    item['degree_name'] = degree_name
    item['start_date'] = start_date
    item['overview_en'] = overview_en
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = tuition_fee_pre
    item['modules_en'] = modules_en
    item['ib'] = ib
    item['career_en'] = career_en
    item['require_chinese_en'] = require_chinese_en
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['ielts_s'] = ielts_s
    item['ielts_l'] = ielts_l
    item['apply_desc_en'] = apply_desc_en
    item['ucascode'] = ucascode
    yield item
def parse(self, response):
    """Scrape a Royal Agricultural University course page into one item.

    All fields come from ``response`` or from fixed text/values. The degree
    name is the first whitespace-separated token of the page title (after
    dropping ``"(Hons) "``) and is removed from the programme name; an empty
    title now yields an empty degree name instead of raising ``IndexError``.

    Yields a single ``ScrapyschoolEnglandItem1``.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    # 1.university
    university = 'Royal Agricultural University'
    # 2.url
    url = response.url

    # 3.programme_en
    programme_en = remove_tags(''.join(response.xpath('//*[@id="site"]//div[1]/div//h1').extract()))

    # 4.degree_type
    degree_type = 1

    # 5.degree_name
    if '(Hons) ' in programme_en:
        programme_en = programme_en.replace('(Hons) ', '')
    title_tokens = programme_en.split()
    degree_name = title_tokens[0] if title_tokens else ''
    if degree_name:
        # Drop only the leading award token; later occurrences of the same
        # text inside the programme name are kept.
        programme_en = programme_en.replace(degree_name, '', 1)
    programme_en = programme_en.strip()

    # 6.overview_en
    overview_en = remove_class(''.join(response.xpath('//*[@id="course-overview"]/div[1]/p').extract()))

    # 7.ucascode
    ucascode = ''.join(
        response.xpath('//*[@id="site"]/div/main/div/div/div[2]/div/div/div/h3').extract())
    ucascode = clear_space_str(remove_tags(ucascode))

    # 8.modules_en
    modules_en = '\n'.join(
        response.xpath("//*[contains(text(),'Modules')]//following-sibling::ul/li").extract())
    modules_en = remove_class(modules_en)

    # 9.apply_desc_en
    apply_desc_en = remove_class(''.join(response.xpath('//*[@id="course-requirements"]/div[1]').extract()))

    # 10.tuition_fee
    tuition_fee = ''.join(
        response.xpath('//*[@id="course-fees"]/div[1]/table[1]/tbody/tr[1]/td[3]').extract())
    tuition_fee = getTuition_fee(remove_tags(tuition_fee))

    # 11.tuition_fee_pre
    tuition_fee_pre = '£'

    # 12.career_en
    career_en = ''.join(
        response.xpath("//*[contains(text(),'Prospects')]//following-sibling::*").extract())
    career_en = remove_class(career_en).strip()

    # 13.apply_proces_en
    apply_proces_en = ''.join(
        response.xpath("//*[contains(text(),'Apply now')]//following-sibling::div[1]").extract())
    apply_proces_en = remove_class(apply_proces_en).strip()

    # 14.start_date
    start_date = '2018-9-1'

    # 15.assessment_en
    assessment_en = '<p>During your undergraduate degree, you probably became familiar with many of the methods of delivery and study that we expect you to continue with during your postgraduate course. It is expected that you come already equipped with the basics in academic study, such as the ability to find, evaluate, manage, present and critique research or industry relevant output. There is a greater emphasis on independence and individual contribution towards the topics covered, and so the expectation is that students will actively participate in class-based activities from the outset. Giving presentations, critiquing case studies, using peer-to-peer feedback, working in groups on topical problems and justifying opinions based on the evidence is the norm for postgraduate study. It is not uncommon for students to arrive at a particular postgraduate qualification with very diverse backgrounds, qualifications and experience and we welcome these different perspectives in the classroom to bring a debate alive, however, it does require the student to take responsibility for their own subject knowledge gaps and motivate themselves to fill them. Of course, there will be support and guidance provided for good sources of information, however, it is not expected that these gaps will be specifically addressed within the taught sessions.For most postgraduate programmes group sizes are in the range of between 20 – 100 depending on the course and electives chosen (if relevant). However, alongside the lectures are small group seminars and tutorials where you will have the opportunity to explore key concepts in more detail, discuss topical issues relating to the key themes and undertake practical activities that help set the theories in context. To compliment the lectures and seminars, there may also be practical sessions, laboratory classes, off-site visits, case studies, guest speakers and field trips that are included in your timetabled activities depending on the modules you are studying.</p>'

    # 16.deadline
    deadline = '2018-11,2019-5'

    # 17.require_chinese_en
    require_chinese_en = '<p>International Foundation Year We run an International Foundation Year programme in partnership with our partner, INTO London World education Centre based in London. To enquire about the programme please get in touch with our admissions team: [email protected] Undergraduate Degrees (Bachelors) Senior Secondary School Graduation certificate 高中毕业证书 with overall grade B or higher (to include Maths) Plus Gao Kao – Chinese University/College Entrance examination (高考) with good grades OR completion of a recognised International Foundation course with overall grade 60% or above OR successful completion of 1 year of University degree with a minimum of 60%.And IELTS band score 6.0 overall or above with no less than 5.5 in each component of the academic IELTS test. (The test must have been taken within two years of the start of the course). =Academic transfers to RAU into Years 2 and 3 are possible. For more information contact [email protected]</p>'

    # 18.ielts
    ielts = 6.0
    ielts_s = 5.5
    ielts_w = 5.5
    ielts_l = 5.5
    ielts_r = 5.5

    # 19.apply_pre
    apply_pre = '£'

    # 20.alevel
    alevel = ''.join(
        response.xpath('//*[@id="course-requirements"]/div[1]/div/ul[1]/li[1]').extract())
    alevel = remove_class(alevel)

    # 21.duration
    duration = 3

    # 22.ib
    ib = ''.join(
        response.xpath('//*[@id="course-requirements"]/div[1]/div/ul[1]/li[4]').extract())
    ib = remove_class(ib)

    item['ib'] = ib
    item['duration'] = duration
    item['alevel'] = alevel
    item['apply_pre'] = apply_pre
    item['university'] = university
    item['url'] = url
    item['programme_en'] = programme_en
    item['degree_type'] = degree_type
    item['degree_name'] = degree_name
    item['overview_en'] = overview_en
    item['modules_en'] = modules_en
    item['apply_desc_en'] = apply_desc_en
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = tuition_fee_pre
    item['career_en'] = career_en
    item['apply_proces_en'] = apply_proces_en
    item['start_date'] = start_date
    item['assessment_en'] = assessment_en
    item['deadline'] = deadline
    item['require_chinese_en'] = require_chinese_en
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['ielts_l'] = ielts_l
    item['ielts_s'] = ielts_s
    item['ucascode'] = ucascode
    yield item
def parse(self, response):
    """Scrape a Sheffield Hallam University course page into one item.

    All fields come from ``response`` or from fixed text/values.

    * ``duration`` is the first integer found after "How long will I study?"
      (default ``1``); ``duration_per`` is ``3`` when that number is greater
      than 5 and ``1`` otherwise. A later unconditional ``duration_per = 1``
      that discarded this result has been removed.
    * IELTS scores come from the entry-requirements HTML when at least two
      ``[567].d`` numbers are found (first = overall, second = every
      component); otherwise defaults of 6.5 / 6.0 are used. A single match
      used to raise ``IndexError``.

    Yields a single ``ScrapyschoolEnglandItem1``.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    # 1.university
    university = 'Sheffield Hallam University'
    # 2.url
    url = response.url

    # 3.programme_en
    programme_en = remove_tags(''.join(response.xpath("/html/body/section[1]//h1").extract()))

    # 4.degree_type
    degree_type = 2

    # 5.degree_name
    degree_name = ''.join(response.xpath('/html/body/section[1]/div/div[2]/span').extract())
    degree_name = remove_tags(degree_name)

    # 6.tuition_fee
    tuition_fee = ''.join(
        response.xpath("//*[contains(text(),'What is the fee?')]//following-sibling::*").extract())
    tuition_fee = getTuition_fee(tuition_fee)

    # 7.tuition_fee_pre
    tuition_fee_pre = '£'

    # 8.duration / duration_per
    duration_list = ''.join(
        response.xpath("//*[contains(text(),'How long will I study?')]//following-sibling::*").extract())
    duration_list = remove_tags(duration_list)
    duration_numbers = re.findall(r'\d+', duration_list)
    # NOTE(review): a match keeps the number as a string; the default is the int 1.
    duration = duration_numbers[0] if duration_numbers else 1
    duration_per = 3 if int(duration) > 5 else 1

    # 9.location
    location = 'Sheffield'

    # 10.teach_time
    teach_time = remove_tags(''.join(response.xpath('/html/body/section[1]//span[1]').extract()))
    teach_time = 'Full-time' if 'Full-time' in teach_time else 'Part-time'

    # 11.overview_en
    overview_en = ''.join(
        response.xpath("//*[contains(text(),'Course summary')]//following-sibling::*").extract())
    overview_en = clear_space_str(remove_class(overview_en))

    # 12.career_en
    career_en = ''.join(
        response.xpath("//*[contains(text(),'Future careers')]//following-sibling::*").extract())
    career_en = clear_space_str(remove_class(career_en))

    # 13.rntry_requirements
    rntry_requirements = ''.join(response.xpath('//*[@id="entry-requirements"]/div').extract())
    rntry_requirements = remove_class(rntry_requirements)

    # 14.modules_en
    modules_en = ''.join(
        response.xpath("//*[contains(text(),'Compulsory modules')]/../following-sibling::*").extract())
    modules_en = clear_space_str(remove_class(modules_en))

    # 15.apply_proces_en
    apply_proces_en = ''.join(response.xpath('//*[@id="apply-now"]/div[1]//a/@href').extract())

    # 17-21.ielts: both an overall and a component score are needed.
    ielts_list = re.findall(r'[567]\.\d', rntry_requirements)
    if len(ielts_list) >= 2:
        ielts = ielts_list[0]
        ielts_r = ielts_l = ielts_s = ielts_w = ielts_list[1]
    else:
        ielts = 6.5
        ielts_r = ielts_l = ielts_s = ielts_w = 6.0

    # 22.require_chinese_en
    require_chinese_en = '<p>The following qualifications from China will be considered for entry on to postgraduate taught programmes, with a usual minimum average of 60 per cent Four year Bachelor Degree from a recognised university Three year university diploma plus relevant work experience Successful completion of a recognised pre-masters course</p>'

    # 23.apply_pre
    apply_pre = '£'

    # 24.start_date
    start_date = ''.join(
        response.xpath("//*[contains(text(),'When do I start?')]//following-sibling::*").extract())
    start_date = clear_space_str(remove_tags(start_date))
    if 'September, January' in start_date:
        start_date = '2018-9,2019-1'
    elif 'January' in start_date:
        start_date = '2019-1'
    else:
        start_date = '2018-' + str(translate_month(start_date))

    item['start_date'] = start_date
    item['apply_pre'] = apply_pre
    item['require_chinese_en'] = require_chinese_en
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_s'] = ielts_s
    item['ielts_w'] = ielts_w
    item['ielts_l'] = ielts_l
    item['university'] = university
    item['url'] = url
    item['programme_en'] = programme_en
    item['degree_type'] = degree_type
    item['degree_name'] = degree_name
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = tuition_fee_pre
    item['duration'] = duration
    item['location'] = location
    item['teach_time'] = teach_time
    item['overview_en'] = overview_en
    item['career_en'] = career_en
    item['rntry_requirements'] = rntry_requirements
    item['modules_en'] = modules_en
    item['apply_proces_en'] = apply_proces_en
    item['duration_per'] = duration_per
    yield item
def parse(self, response):
    """Build the course item for one Bournemouth University postgraduate page.

    Page-dependent fields are selected from ``response`` with XPath and
    cleaned with the project helpers (``remove_tags``, ``remove_class``,
    ``clear_space_str``); the remaining fields are fixed values.

    Args:
        response: the course page response; it is used through
            ``response.xpath(...).extract()`` and ``response.url``.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    def joined(xpath):
        """Return every node selected by *xpath*, serialised and concatenated."""
        return ''.join(response.xpath(xpath).extract())

    #1.university
    university = 'Bournemouth University'

    #2.location
    location = remove_tags(
        joined("//*[contains(text(),'Location:')]//following-sibling::p"))

    #3.programme_en 4.degree_name
    programme_en = remove_tags(joined('/html/body/div/section//h1'))
    title_words = programme_en.split()
    # An empty heading has no first word; fall back to an empty degree name.
    degree_name = title_words[0] if title_words else ''
    programme_en = programme_en.replace('-', '')
    programme_en = programme_en.replace(degree_name, '')
    programme_en = clear_space_str(programme_en)
    if '–' in programme_en:
        programme_en = programme_en.replace('–', '').strip()
    programme_en = programme_en.replace('&', '')

    #5.degree_type
    degree_type = 2

    #6.teach_time
    # The page's 'Delivery:' text used to be scraped here, but its result was
    # always overwritten by this constant before being stored, so only the
    # constant is kept.
    teach_time = 'full time'

    #7.duration #8.duration_per
    duration_text = remove_tags(
        joined("//*[contains(text(),'Duration:')]//following-sibling::p"))
    # (phrase, duration, duration_per); the first phrase found wins.
    # duration_per is 1 for the year phrases and 3 for the month phrases.
    # A phrase that contains another phrase must come first: '18-36 months'
    # precedes '36 months' (it used to follow it and could never match).
    duration_rules = (
        ('1 year', 1, 1),
        ('12-18 months', 12, 3),
        ('18-36 months', 18, 3),
        ('36 months', 36, 3),
        ('1 to 2 years', 1, 1),
        ('2 years', 2, 1),
        ('3-5 years', 3, 1),
        ('48 months', 48, 3),
        ('12 months', 12, 3),
        ('5 years', 5, 1),
        ('3 years', 3, 1),
        ('14 months', 14, 3),
        ('15 months', 15, 3),
        ('18-24 months', 18, 3),
        ('27 months', 27, 3),
        ('8 months', 8, 3),
        ('Nine months', 9, 3),
    )
    duration, duration_per = 1, 1  # default when no phrase is recognised
    for phrase, amount, unit in duration_rules:
        if phrase in duration_text:
            duration, duration_per = amount, unit
            break

    #9.overview_en
    overview_en = clear_space_str(
        remove_class(joined('//*[@id="main-content"]/div/section[2]/p')))

    #11.modules_en
    modules_en = clear_space_str(
        remove_class(
            joined("//section[@id='course-details']//div[@id='accordion-1']")))

    #12.start_date
    start_date = remove_tags(
        joined("//*[contains(text(),'Next start date:')]//following-sibling::p"))
    start_date = clear_space_str(start_date)
    start_date = ','.join(tracslateDate(start_date))

    #13.rntry_requirements
    rntry_requirements = clear_space_str(
        remove_class(
            joined("//*[contains(text(),'Entry requirements')]/../following-sibling::div[1]")))

    #14.ielts 15.16.17.18
    ielts_list = re.findall(r'\d\.\d', rntry_requirements)
    if 1 <= len(ielts_list) <= 4:
        # The last score is used for every component; the overall score is
        # the one just before it (or the only score when there is just one).
        ielts = ielts_list[max(len(ielts_list) - 2, 0)]
        ielts_l = ielts_s = ielts_r = ielts_w = ielts_list[-1]
    else:
        ielts = None
        ielts_l = ielts_s = ielts_r = ielts_w = None

    #19.career_en
    career_en = clear_space_str(
        remove_class(
            joined("//*[contains(text(),'Careers')]/../following-sibling::*|//*[contains(text(),'Careers')]//following-sibling::*")))

    #20.tuition_fee,#21.tuition_fee_pre
    tuition_fee = getTuition_fee(
        joined('//*[@id="fees-box"]/div/div/span|//*[@id="fees-box"]/div[2]/div[2]/p[2]|//*[@id="fees-box"]/div/div[2]/ul/li[1]'))
    tuition_fee_pre = '£'

    #22.url
    url = response.url

    #23.application_open_date
    application_open_date = '2018-7-18'

    #24.apply_pre
    apply_pre = '£'

    #25.apply_fee
    apply_fee = 0

    #26.apply_proces_en
    apply_proces_en = '<p>Step 1: Application Complete all sections of your country’s application form. Step 2: Terms and conditions You must read, understand and agree to be bound by the terms and conditions before moving on to the next step. Step 3: Confirmation Sign the application form to confirm you have provided correct details and you agree to the terms and conditions. Step 4: Other documents Scan, attach and send in your additional documents to the email address on the application form: Academic transcripts and exam results English test score if required If you do not have academic transcripts or English test results, you can still apply and we can make you a conditional offer, the conditions of which you will need to satisfy before we confirm your place. Step 5: Assessment We’ll contact you to arrange one or more of the following if required: English language test Mathematics test Interview This will allow us to further assess your suitability for the program. Step 6: Admission decision Receive an admission decision and, if your application is successful, accompanying offer letter. Step 7: Deposit Accept your offer by paying the deposit. Step 8: Pre-arrival Receive confirmation of program acceptance, pre-arrival information, plus guidance on finding local accommodation if you are coming from outside the USPP host city to study. For conditional offers, these items are issued once we receive proof that the conditions of the offer have been met. Step 9: Travel Arrange travel to USPP location if applicable.Arrive at your USPP center for orientation before classes begin. Step 10: Begin your USPP program USPP teaching begins. View the program timeline for next steps.</p>'

    #27.require_chinese_en
    require_chinese_en = "<p>This is a guide to the normal entry requirements, assuming you’ve followed the Chinese education system. An admissions tutor will study your application, so make sure you include your academic background and personal information when you apply.Entry requirements vary depending on what sort of course you’re coming to BU to study. BU International College Foundation Certificates You can undertake a Foundation Certificate before going on to an undergraduate course if you’ve completed 11 years of schooling or Senior High School Year 2 in China and have a minimum of IELTS (Academic) 5.0. Undergraduate courses You can apply to study a Bachelor's degree from year one if you hold a Chinese Senior High School Diploma plus successful completion of a relevant first-year undergraduate programme in a recognised Chinese university, or a Diploma from Specialized College (zhongzhuan). Chinese Senior High School certificate of graduation with overall HuiKao result grade B average, transcripts of 3 years with 85% average (85% also eligible for AES). Top-up courses You need to hold a College Graduation Diploma (Dazhuan awarded by a university/college on completion of two to three years of study), or a BTEC Higher National Diploma or Foundation degree in a relevant subject.Postgraduate courses You need to have a Bachelor's (Honours) degree from a recognised Chinese university, normally from a four-year undergraduate programme, or a Bachelors degree from Higher Education Self-Study Examinations, or a Top-up degree or university-recognised Pre-Master’s Foundation programme. Grade requirements from Chinese Bachelor's degree holders are as below: Applicants from 985 or 211 universities Media studies and other subjects equivalent to UK 2:1 degree 65% + GPA 2.25 + Business and subjects equivalent to UK 2:2 degree 60% + GPA 2.0 + Academic Excellence Scholarship (automatic award of £3500) 75% + GPA 2.75 + Applicants from other universities Media studies and other subjects equivalent to UK 2:1 degree 70% + GPA 2.5 + Business and subjects equivalent to UK 2:2 degree 65% + GPA 2.25 + Academic Excellence Scholarship (automatic award of £3500) 80% + GPA 3.0 + Research programmes You need a good postgraduate degree to be considered for a BU research programme. Please see more detail on the postgraduate research page.You can find more information about English language requirements for entry to BU on our English language requirements page. Full information about preparatory courses is available on the Bournemouth University International College website.If you need help with your visa or want more information about the immigration process, you can find it on our immigration information page.</p>"

    # Written in the same order as before so the item's key order is kept.
    fields = (
        ('require_chinese_en', require_chinese_en),
        ('apply_proces_en', apply_proces_en),
        ('apply_fee', apply_fee),
        ('apply_pre', apply_pre),
        ('university', university),
        ('location', location),
        ('programme_en', programme_en),
        ('degree_name', degree_name),
        ('degree_type', degree_type),
        ('teach_time', teach_time),
        ('duration', duration),
        ('duration_per', duration_per),
        ('overview_en', overview_en),
        ('modules_en', modules_en),
        ('start_date', start_date),
        ('rntry_requirements', rntry_requirements),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_s', ielts_s),
        ('ielts_w', ielts_w),
        ('ielts_l', ielts_l),
        ('career_en', career_en),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('url', url),
        ('application_open_date', application_open_date),
    )
    for key, value in fields:
        item[key] = value
    yield item
def parse(self, response):
    """Build the course item for one Swinburne University of Technology page.

    Page-dependent fields are selected from ``response`` with XPath; a field
    whose nodes are missing falls back to a placeholder (``'N/A'``, ``''``,
    ``0`` or ``None``, depending on the field) instead of failing the page.

    Args:
        response: the course page response; it is used through
            ``response.xpath(...).extract()`` and ``response.url``.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    #1.university
    university = 'Swinburne University of Technology'

    #2.url
    url = response.url

    #3.department
    try:
        department = ''.join(
            response.xpath(
                '//*[@id="content"]/main/section[1]/div[2]/div/comment()'
            ).extract()).replace('Faculty', '')
        department = clear_space_str(department)
        department = remove_tags(department).replace(' -->', '').strip()
        department = 'Faculty ' + department
    except Exception:
        # Best effort: any helper failure leaves the placeholder.
        department = 'N/A'

    #4.programme_en
    title_xpath = '//*[@id="content"]/main/section[1]/header/div[1]/h1'
    try:
        programme_en = remove_tags(response.xpath(title_xpath).extract()[0])
        programme_en = programme_en.replace('Master of ', '').strip()
    except IndexError:
        programme_en = 'N/A'
    # Titles carrying one of these qualifiers are kept whole; any other
    # title with a bracketed part is reduced to the text inside the brackets.
    kept_whole = ('(International)', ' (Professional)', '(Advanced)',
                  '(Executive)')
    if not any(marker in programme_en for marker in kept_whole):
        bracketed = re.search(r'\((.*)\)', programme_en)
        # A '(' without a matching ')' used to raise IndexError; such a
        # title is now left unchanged.
        if bracketed:
            programme_en = bracketed.group(1)

    #5.degree_name
    degree_name = remove_tags(''.join(response.xpath(title_xpath).extract()))

    #6.start_date
    start_date = '2,7'

    #7.degree_overview_en
    try:
        degree_overview_en = remove_class(''.join(
            response.xpath(
                '//*[@id="content"]/main/section[1]/div[2]/div[1]').extract()))
    except Exception:
        degree_overview_en = 'N/A'

    #8.apply_pre
    apply_pre = 'A$'

    #9.duration
    try:
        duration = remove_tags(
            response.xpath(
                '//h3[contains(text(),"Duration")]/following-sibling::p'
            ).extract()[0])
    except IndexError:
        duration = 'N/A'

    # 10.modules_en
    try:
        modules_en = remove_class(''.join(
            response.xpath(
                '//h3[contains(text(),"Course description")]/following-sibling::div'
            ).extract()))
    except Exception:
        modules_en = 'N/A'

    # 11.career_en
    try:
        career_text = ''.join(
            response.xpath(
                '//h3[contains(text(),"Career")]/following-sibling::div[1]//text()'
            ).extract()).strip()
        career_en = '<p>' + career_text + '</p>'
    except Exception:
        career_en = ''

    # 12.tuition_fee
    try:
        fee_text = remove_tags(
            response.xpath(
                '//h3[contains(text(),"Fee")]/following-sibling::p').extract()[0])
        tuition_fee = ''.join(re.findall(r'\$\d{4,6}', fee_text)).replace(
            '$', '')
    except IndexError:
        tuition_fee = 0

    #13.rntry_requirements_en
    try:
        rntry_requirements_en = remove_class(
            response.xpath(
                '//h3[contains(text(),"Entry requirements")]/following-sibling::div'
            ).extract()[0])
    except IndexError:
        rntry_requirements_en = 'N/A'

    # 14.ielts 15161718 19.toefl 20212223
    ielts_text = ''.join(
        response.xpath(
            '//h3[contains(text(),"English language requirements")]/following-sibling::div'
        ).extract())

    ielts_scores = re.findall(r'[567]\.\d', ielts_text)
    if ielts_scores:
        # Highest score found is the overall requirement, lowest is used
        # for every component.
        ielts = max(ielts_scores)
        ielts_l = ielts_s = ielts_r = ielts_w = min(ielts_scores)
    else:
        ielts = ''
        ielts_l = ielts_s = ielts_r = ielts_w = ''

    toefl_sentence = ''.join(
        re.findall(
            r'score of [6-9]\d[\sa-zA-Z\,]*[0-2]\d[\sa-zA-Z]*[0-2]?\d?',
            ielts_text))
    toefls = re.findall(r'\d{2}', toefl_sentence)
    if len(toefls) == 3:
        toefl = toefls[0]
        toefl_r = toefl_w = toefls[1]
        toefl_s = toefl_l = toefls[2]
    else:
        # Previously ``toefl`` kept the whole matched sentence here; store
        # the first (overall) score instead, or '' when nothing matched.
        toefl = toefls[0] if toefls else ''
        toefl_r = toefl_w = toefl_s = toefl_l = None

    #24.apply_proces_en
    # NOTE(review): the parts are joined with no separator, so the emitted
    # text runs sentences together ("...student2. Select..."). Kept as is
    # to leave the stored data unchanged - confirm whether that is intended.
    apply_proces_en = '<p>' + ''.join([
        "Before you start your application, make sure you have followed these important steps. After checking these details, you will be ready to start your application to study at Swinburne.",
        "You can also read about the Australian Government’s Education Services for Overseas Students (ESOS) regulatory framework so that you understand your rights and responsibilities as an international student before and during your study.",
        "1. Check that you are an international student",
        "2. Select your course",
        "3. Check entry requirements",
        "4. Check to see if you are eligible for credit",
        "5. Review tuition fees",
        "6. Compile education and employment history",
        "7. Prepare your documents",
        "8. Begin your application",
    ]) + '</p>'

    # 25.apply_desc_en
    apply_desc_en = '<p>' + ''.join([
        "You may also be required to submit documents to support your application.",
        "certified academic documents",
        "certified copy of your passport",
        "English proficiency test results",
        "certified copy of unit outlines and academic transcripts",
        "portfolio (for most design courses)",
        "English translations of all documents, if not already in English",
    ]) + '</p>'

    #27.location
    location = 'Hawthorn'

    #28.tuition_fee_pre
    tuition_fee_pre = 'A$'

    #29.degree_type
    degree_type = 2

    # Written in the same order as before so the item's key order is kept.
    fields = (
        ('tuition_fee_pre', tuition_fee_pre),
        ('degree_type', degree_type),
        ('university', university),
        ('url', url),
        ('department', department),
        ('programme_en', programme_en),
        ('degree_name', degree_name),
        ('start_date', start_date),
        ('degree_overview_en', degree_overview_en),
        ('duration', duration),
        ('apply_pre', apply_pre),
        ('modules_en', modules_en),
        ('career_en', career_en),
        ('tuition_fee', tuition_fee),
        ('rntry_requirements_en', rntry_requirements_en),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_w', ielts_w),
        ('ielts_l', ielts_l),
        ('ielts_s', ielts_s),
        ('toefl', toefl),
        ('toefl_r', toefl_r),
        ('toefl_w', toefl_w),
        ('toefl_s', toefl_s),
        ('toefl_l', toefl_l),
        ('apply_proces_en', apply_proces_en),
        ('apply_desc_en', apply_desc_en),
        ('location', location),
    )
    for key, value in fields:
        item[key] = value
    yield item
def parse(self, response):
    """Build the course item for one Bournemouth University undergraduate page.

    Page-dependent fields are selected from ``response`` with XPath and
    cleaned with the project helpers (``remove_tags``, ``remove_class``,
    ``clear_space_str``); the remaining fields are fixed values.

    Args:
        response: the course page response; it is used through
            ``response.xpath(...).extract()`` and ``response.url``.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    def joined(xpath):
        """Return every node selected by *xpath*, serialised and concatenated."""
        return ''.join(response.xpath(xpath).extract())

    #1.university
    university = 'Bournemouth University'

    #2.location
    location = remove_tags(
        joined("//*[contains(text(),'Location:')]//following-sibling::p"))

    #3.programme_en 4.degree_name
    programme_en = remove_tags(joined('/html/body/div/section//h1'))
    title_words = programme_en.split()
    # An empty heading has no first word; fall back to an empty degree name.
    degree_name = title_words[0] if title_words else ''
    programme_en = programme_en.replace('-', '')
    programme_en = programme_en.replace(degree_name, '')
    programme_en = clear_space_str(programme_en)
    if '–' in programme_en:
        programme_en = programme_en.replace('–', '').strip()
    programme_en = programme_en.replace('&', '').replace('(Hons)', '').strip()

    #5.degree_type
    degree_type = 1

    #6.ucascode
    ucascode = remove_tags(
        joined("//*[contains(text(),'UCAS Code:')]//following-sibling::*"))

    #7.duration #8.duration_per
    duration_a = remove_tags(
        joined("//*[contains(text(),'Duration:')]//following-sibling::p"))
    if 'Four years' in duration_a:
        duration = 4
    else:
        # First digit in the text. A page with no digit (for instance a
        # missing 'Duration:' entry) used to raise IndexError and abort the
        # whole item; it now yields None.
        digits = re.findall(r'\d', duration_a)
        duration = digits[0] if digits else None
    duration_per = 1

    #9.overview_en
    overview_en = clear_space_str(
        remove_class(joined('//*[@id="main-content"]/div/section[3]/p')))

    #10.alevel
    try:
        # The second matching paragraph is the one that is used.
        alevel = remove_tags(
            response.xpath(
                "//*[contains(text(),'GCSEs')]//preceding-sibling::p").extract()[1])
    except IndexError:
        alevel = 'N/A'

    #11.modules_en
    modules_en = clear_space_str(
        remove_class(
            joined("//section[@id='course-details']//div[@id='accordion-1']")))

    #12.start_date
    start_date = remove_tags(
        joined("//*[contains(text(),'Next start date:')]//following-sibling::p"))
    start_date = clear_space_str(start_date)
    start_date = ','.join(tracslateDate(start_date))

    #13.ib
    ib = remove_tags(
        joined("//*[contains(text(),'International Baccalaureate')]/.."))
    ib = ib[:500]  # stored text is capped at 500 characters

    #14.ielts 15.16.17.18
    rntry_requirements = joined('//*[@id="entry-requirements"]/div')
    ielts_list = re.findall(r'\d\.\d', rntry_requirements)
    if 1 <= len(ielts_list) <= 4:
        # The last score is used for every component; the overall score is
        # the one just before it (or the only score when there is just one).
        ielts = ielts_list[max(len(ielts_list) - 2, 0)]
        ielts_l = ielts_s = ielts_r = ielts_w = ielts_list[-1]
    else:
        ielts = None
        ielts_l = ielts_s = ielts_r = ielts_w = None

    #19.career_en
    career_en = clear_space_str(
        remove_class(
            joined("//*[contains(text(),'Careers')]//following-sibling::*")))

    #20.tuition_fee,#21.tuition_fee_pre
    tuition_fee = getTuition_fee(
        joined('//*[@id="fees-box"]/div/div/span|//*[@id="fees-box"]/div[2]/div[2]/p[2]|//*[@id="fees-box"]/div/div[2]/ul/li[1]'))
    tuition_fee_pre = '£'

    #22.url
    url = response.url

    #23.application_open_date
    application_open_date = '2018-7-18'

    #24.apply_pre
    apply_pre = '£'

    #25.apply_fee
    apply_fee = 0

    #26.apply_proces_en
    apply_proces_en = '<p>Step 1: Application Complete all sections of your country’s application form. Step 2: Terms and conditions You must read, understand and agree to be bound by the terms and conditions before moving on to the next step. Step 3: Confirmation Sign the application form to confirm you have provided correct details and you agree to the terms and conditions. Step 4: Other documents Scan, attach and send in your additional documents to the email address on the application form: Academic transcripts and exam results English test score if required If you do not have academic transcripts or English test results, you can still apply and we can make you a conditional offer, the conditions of which you will need to satisfy before we confirm your place. Step 5: Assessment We’ll contact you to arrange one or more of the following if required: English language test Mathematics test Interview This will allow us to further assess your suitability for the program. Step 6: Admission decision Receive an admission decision and, if your application is successful, accompanying offer letter. Step 7: Deposit Accept your offer by paying the deposit. Step 8: Pre-arrival Receive confirmation of program acceptance, pre-arrival information, plus guidance on finding local accommodation if you are coming from outside the USPP host city to study. For conditional offers, these items are issued once we receive proof that the conditions of the offer have been met. Step 9: Travel Arrange travel to USPP location if applicable.Arrive at your USPP center for orientation before classes begin. Step 10: Begin your USPP program USPP teaching begins. View the program timeline for next steps.</p>'

    #27.assessment_en
    assessment_en = remove_class(
        joined("//*[contains(text(),'How you will be assessed')]//following-sibling::p|//*[@id='accordion-1']/div[6]"))

    # Written in the same order as before so the item's key order is kept.
    fields = (
        ('assessment_en', assessment_en),
        ('alevel', alevel),
        ('ib', ib),
        ('ucascode', ucascode),
        ('apply_proces_en', apply_proces_en),
        ('apply_fee', apply_fee),
        ('apply_pre', apply_pre),
        ('university', university),
        ('location', location),
        ('programme_en', programme_en),
        ('degree_name', degree_name),
        ('degree_type', degree_type),
        ('duration', duration),
        ('duration_per', duration_per),
        ('overview_en', overview_en),
        ('modules_en', modules_en),
        ('start_date', start_date),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_s', ielts_s),
        ('ielts_w', ielts_w),
        ('ielts_l', ielts_l),
        ('career_en', career_en),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('url', url),
        ('application_open_date', application_open_date),
    )
    for key, value in fields:
        item[key] = value
    yield item
def parse(self, response):
    """Build the course item for one University of Adelaide master's page.

    Page-dependent fields are selected from ``response`` with XPath; English
    test scores fall back to fixed defaults when the expected table cells
    are missing.

    Args:
        response: the course page response; it is used through
            ``response.xpath(...).extract()`` and ``response.url``.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with the
        course fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    def joined(xpath):
        """Return every node selected by *xpath*, serialised and concatenated."""
        return ''.join(response.xpath(xpath).extract())

    #1.university
    university = 'The University of Adelaide'

    #2.url
    url = response.url

    #3.programme_en
    programme_en_1 = joined('//*[@id="ua-main-content"]/h2/text()')
    full_title = remove_tags(programme_en_1)
    programme_en = full_title.replace('Master of ', '')
    bracketed = re.search(r'\((.*)\)', programme_en)
    # A title with a bracketed part is reduced to the text in the brackets.
    # A '(' without a matching ')' used to raise IndexError; such a title is
    # now left unchanged.
    if bracketed:
        programme_en = bracketed.group(1)

    #4.degree_type
    degree_type = 2

    #5.degree_name
    # Everything before the first '(' (the whole title when there is none).
    degree_name = programme_en_1.split('(')[0].strip()

    #6.teach_time
    teach_time = 'coursework'

    #7.duration #8.duration_per
    duration_list = clear_space_str(
        remove_tags(
            joined('//*[@id="ua-main-content"]/div[2]/div[3]/span[2]')))
    if '1.5' in duration_list:
        duration = 1.5
    else:
        digits = re.findall(r'\d', duration_list)
        duration = digits[0] if digits else None
    duration_per = 1

    #9.location
    location = remove_tags(
        joined('//*[@id="ua-main-content"]/div[2]/div[1]/span[2]/a'))
    if '2019/hd' in response.url:
        location = 'North Terrace Campus'

    #10.overview_en
    overview_en = remove_class(
        joined('//*[@id="ua-main-content"]/div[2]/div/div'))

    #11.ielts 12131415
    ielts_list = response.xpath(
        '//*[@id="df-acc-admission"]/div[5]/table[2]//tr[2]/td/table//tr/td'
    ).extract()

    def cell(index):
        """Return the serialised table cell at *index*, or None if absent."""
        return ielts_list[index] if index < len(ielts_list) else None

    def band(index):
        """Return the component score read from the cell at *index*.

        6.5 when the cell is absent or mentions '6.5'; otherwise the first
        digit found in the cell, or None when it has no digit.
        """
        text = cell(index)
        if text is None or '6.5' in text:
            return 6.5
        found = re.findall(r'\d', text)
        return found[0] if found else None

    overall_cell = cell(1)
    if overall_cell is None or '7' in overall_cell:
        ielts = 7
    else:
        overall = re.findall(r'\d\.\d', overall_cell)
        ielts = overall[0] if overall else None
    ielts_r = band(2)
    ielts_l = band(3)
    ielts_s = band(4)
    ielts_w = band(5)

    #16.toefl 17181920
    toefl_text = remove_tags(
        joined('//*[@id="df-acc-admission"]/div[5]/table[2]//tr[3]/td/table//tr/td'))
    toefl_numbers = re.findall(r'\d+', toefl_text)
    if len(toefl_numbers) >= 5:
        # First five numbers: overall, reading, listening, speaking, writing.
        toefl, toefl_r, toefl_l, toefl_s, toefl_w = toefl_numbers[:5]
    else:
        toefl, toefl_r, toefl_l, toefl_s, toefl_w = 94, 24, 24, 23, 27

    #21.rntry_requirements_en
    rntry_requirements_en = remove_class(
        joined('//*[@id="df-acc-admission"]/div[5]/table[3]//tr/td'))

    #22.apply_proces_en
    apply_proces_en = 'https://international.adelaide.edu.au/admissions/how-to-apply'

    #23.deadline
    # Matched against the full title: ``programme_en`` has already had
    # 'Master of ' removed, so testing it (as before) could never select
    # the two programme-specific deadlines.
    if 'Master of Psychology' in full_title:
        deadline = '2018-10-21,2019-5-1'
    elif 'Master of Viticulture and Oenology' in full_title:
        deadline = '2018-12-1,2019-4-30'
    else:
        deadline = '2018-12-1,2019-5-1'

    #24.tuition_fee
    tuition_fee = getTuition_fee(
        joined('//*[@id="df-acc-fees_scholarships"]/div[5]/table//tr/td[2]'))

    #25.tuition_fee_pre
    tuition_fee_pre = '$'

    #26.apply_pre
    apply_pre = '$'

    #27.career_en
    career_en = remove_class(
        joined('//*[@id="df-acc-careers_parent"]//following-sibling::*'))

    #28.modules_en
    modules_en = remove_class(
        joined("//h4[contains(text(),'Example Study Plan')]/following-sibling::div[1]"))

    # Written in the same order as before so the item's key order is kept.
    fields = (
        ('university', university),
        ('url', url),
        ('programme_en', programme_en),
        ('degree_type', degree_type),
        ('degree_name', degree_name),
        ('teach_time', teach_time),
        ('duration', duration),
        ('duration_per', duration_per),
        ('location', location),
        ('overview_en', overview_en),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_w', ielts_w),
        ('ielts_s', ielts_s),
        ('ielts_l', ielts_l),
        ('toefl', toefl),
        ('toefl_r', toefl_r),
        ('toefl_s', toefl_s),
        ('toefl_w', toefl_w),
        ('toefl_l', toefl_l),
        ('rntry_requirements_en', rntry_requirements_en),
        ('apply_proces_en', apply_proces_en),
        ('deadline', deadline),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('career_en', career_en),
        ('apply_pre', apply_pre),
        ('modules_en', modules_en),
    )
    for key, value in fields:
        item[key] = value
    yield item
def parse_rntry_requirements(self, entry_requirements_url):
    """Download the entry-requirements page and pull out its HTML and score cells.

    Args:
        entry_requirements_url: URL of the page, fetched with ``self.headers``.

    Returns:
        dict: ``IELTS``, ``IELTS_L``, ``IELTS_S``, ``IELTS_R``, ``IELTS_W`` and
        the matching ``TOEFL*`` keys hold the joined text of the table cell
        following the ``Total`` / ``Listening`` / ``Speaking`` / ``Reading`` /
        ``Writing`` header (``''`` when nothing matches). ``entry`` holds the
        cleaned HTML of the requirements block, or ``''`` when the page has
        no such block.
    """
    requirements_class = (
        "field field-name-field-gao-course-requirements "
        "field-type-text-long field-label-hidden"
    )
    # Path to the body of the first table inside the requirements block.
    score_table = (
        "//div[@class='content campl-content-container']"
        "/div[@class='" + requirements_class + "']"
        "/div[@class='field-items']/div[@class='field-item even']"
        "/div[1]/div[1]/table[1]/tbody[1]"
    )

    data = requests.get(entry_requirements_url, headers=self.headers)
    response = etree.HTML(data.text)

    # Serialize the requirements element to HTML; an absent block yields ''
    # instead of an IndexError on the empty match list.
    entry = ''
    entry_requirements = response.xpath("//div[@class='" + requirements_class + "']")
    if entry_requirements:
        entry = etree.tostring(entry_requirements[0], encoding='unicode',
                               pretty_print=False, method='html')
        entry = remove_class(clear_space_str(entry))

    # One lookup per table header; the key suffix names the sub-score.
    cell_texts = []
    for suffix, header in (('', 'Total'), ('_L', 'Listening'), ('_S', 'Speaking'),
                           ('_R', 'Reading'), ('_W', 'Writing')):
        cell = response.xpath(
            score_table + "//th[contains(text(),'" + header
            + "')]/following-sibling::*[1]//text()")
        cell_texts.append((suffix, ''.join(cell)))

    english_dict = {}
    # NOTE(review): the original evaluated byte-identical XPath expressions
    # for IELTS and TOEFL, so both sets of keys receive the same cell text.
    # That looks like a copy-paste slip (TOEFL probably lives in another
    # table) - confirm against the page before changing it.
    for exam in ('IELTS', 'TOEFL'):
        for suffix, text in cell_texts:
            english_dict[exam + suffix] = text
    english_dict['entry'] = entry
    return english_dict
def parse(self, response):
    """Scrape one Royal Holloway programme page into a single item.

    Fields read from the page fall back to a placeholder (``'N/A'`` or
    ``''``) when their extraction raises. University, location, degree type,
    IELTS thresholds, links and teaching mode are hard-coded.

    Args:
        response: page response exposing ``xpath(...).extract()`` and ``url``.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with all
        fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    def joined(xpath):
        # Serialized markup of every node matched by ``xpath``, concatenated.
        return ''.join(response.xpath(xpath).extract())

    university = 'Royal Holloway University of London'

    # Best-effort extraction: any failure keeps the placeholder value.
    try:
        department = remove_tags(joined('//*[@id="main"]/aside/div[2]/a/div[2]/span[2]'))
    except Exception:
        department = 'N/A'

    location = 'London'
    degree_type = 2

    try:
        degree_name = remove_tags(joined('/html/body/div[1]/main/div[1]/div/div/div/span'))
    except Exception:
        degree_name = 'N/A'

    try:
        programme_en = joined('/html/body/div[1]/main/div[1]/div/div/div/h2')
        programme_en = clear_space_str(remove_tags(programme_en))
    except Exception:
        programme_en = ''

    try:
        # Tags are deliberately kept for the overview.
        overview_en = clear_space_str(joined('//*[@id="main"]/article/p[1]'))
    except Exception:
        overview_en = ''

    try:
        # First digit found in the serialized duration element.
        duration = re.findall(
            r'\d', joined('/html/body/div[1]/main/div[2]/div/ul/li[1]/span'))[0]
    except Exception:
        duration = ''
    duration_per = 1

    try:
        modules_en = clear_space_str(remove_class(joined('//*[@id="accordionItem1"]/div')))
    except Exception:
        modules_en = ''

    try:
        assessment_en = clear_space_str(remove_class(joined('//*[@id="accordionItem2"]/div')))
    except Exception:
        assessment_en = ''

    try:
        career_en = clear_space_str(remove_class(joined('//*[@id="accordionItem4"]/div')))
    except Exception:
        career_en = ''

    try:
        fee_text = remove_tags(joined('//*[@id="accordionItem5"]/div/p[2]'))
        # Capture thousands separators too: the former pattern stopped at the
        # first comma, turning "£16,500" into "16".
        tuition_fee = re.findall(r'£(\d[\d,]*)', fee_text)[0].replace(',', '')
    except Exception:
        tuition_fee = 'N/A'
    tuition_fee_pre = '£'

    rntry_requirements = remove_tags(joined('//*[@id="accordionItem3"]/div'))
    rntry_requirements = clear_space_str(rntry_requirements)

    # Hard-coded thresholds as (overall, writing, reading, listening, speaking).
    # The first keyword contained in the programme title wins, so order matters.
    ielts_by_keyword = (
        ('Classics', (6.5, 7, 5.5, 5.5, 5.5)),
        ('theatre', (6.5, 7, 5.5, 5.5, 5.5)),
        ('English', (7, 7, 5.5, 5.5, 5.5)),
        ('European Studies', (6.5, 6.5, 5.5, 5.5, 5.5)),
        ('History', (6.5, 7, 5.5, 5.5, 5.5)),
        ('Media Arts', (6.5, 6.5, 5.5, 5.5, 5.5)),
        ('Music', (6.5, 7, 5.5, 5.5, 5.5)),
        ('Economics', (6.5, 6, 6, 6, 6)),
        ('MBA', (7, 6, 5.5, 5.5, 5.5)),
        ('Management', (6.5, 6, 6, 6, 6)),
        ('Biological Sciences', (6.5, 7, 5.5, 5.5, 5.5)),
        ('Electronic Engineering', (6.5, 5.5, 5.5, 5.5, 5.5)),
        ('Physics', (6.5, 5.5, 5.5, 5.5, 5.5)),
        ('Psychology', (6.5, 5.5, 5.5, 5.5, 5.5)),
    )
    ielts, ielts_w, ielts_r, ielts_l, ielts_s = next(
        (scores for keyword, scores in ielts_by_keyword if keyword in programme_en),
        (6.5, 5.5, 5.5, 5.5, 5.5),
    )

    require_chinese_en = ''
    url = response.url
    other = 'https://intranet.royalholloway.ac.uk/international/documents/pdf/internationalstudentsupport/tier-4-checklist-outside-uk.pdf'
    apply_proces_en = 'https://admissions.royalholloway.ac.uk/AP/Login.aspx'
    teach_time = 'Full-time'

    for key, value in (
        ('teach_time', teach_time),
        ('other', other),
        ('apply_proces_en', apply_proces_en),
        ('university', university),
        ('department', department),
        ('location', location),
        ('degree_type', degree_type),
        ('degree_name', degree_name),
        ('programme_en', programme_en),
        ('overview_en', overview_en),
        ('duration', duration),
        ('duration_per', duration_per),
        ('modules_en', modules_en),
        ('assessment_en', assessment_en),
        ('career_en', career_en),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('rntry_requirements', rntry_requirements),
        ('ielts', ielts),
        ('ielts_w', ielts_w),
        ('ielts_r', ielts_r),
        ('ielts_l', ielts_l),
        ('ielts_s', ielts_s),
        ('require_chinese_en', require_chinese_en),
        ('url', url),
    ):
        item[key] = value
    yield item
def parse(self, response):
    """Scrape one University of Cumbria programme page into a single item.

    Title, location, duration, start date, modules and selection criteria
    are read from the page. Degree type, currency prefixes, the fees link,
    IELTS thresholds and the Chinese-applicant requirement are hard-coded.

    Args:
        response: page response exposing ``xpath(...).extract()`` and ``url``.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with all
        fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    def joined(xpath):
        # Serialized markup of every node matched by ``xpath``, concatenated.
        return ''.join(response.xpath(xpath).extract())

    university = 'University of Cumbria'
    url = response.url

    programme_en = remove_tags(joined('/html/body/main/div[1]/header/div/h1/text()'))
    programme_en = clear_space_str(programme_en)

    degree_type = 2
    degree_name = remove_tags(joined('/html/body/main/div[1]/header/div/h1/em'))

    location = remove_tags(joined("//*[contains(text(),'Location')]//following-sibling::*"))

    # Duration, its unit code and the study mode all come from one text.
    duration_list = remove_tags(
        joined("//*[contains(text(),'Duration')]//following-sibling::*"))
    numbers = re.findall(r'\d+', duration_list)
    duration = numbers[0] if numbers else 1  # default when no number is shown
    # NOTE(review): values above 5 are presumably months (unit code 3) and
    # the rest years (unit code 1) - confirm the code meanings.
    duration_per = 3 if int(duration) > 5 else 1
    teach_time = 'Full-time' if 'Full' in duration_list else 'Part-time'

    start_date = remove_tags(
        joined("//*[contains(text(),'Start date')]//following-sibling::*"))
    # Ordered lookup: the first phrase contained in the page text wins.
    # NOTE(review): several targets do not literally mirror the matched
    # phrase (e.g. the 'July, October' entry); they are kept as they were.
    known_start_dates = (
        ('Various', '2018-*'),
        ('September, November 2018', '2018-9,2018-11'),
        ('September 2018, March 2019', '2018-9,2019-3'),
        ('September 2018; January 2018', '2018-9,2019-1'),
        ('April, September 2018', '2018-9,2019-4'),
        ('January, May, October 2018', '2018-10,2019-1,2019-5'),
        ('January, April, July, October 2018', '2018-7,2019-1,2019-4,2019-7'),
        ('September 2018; March 2018', '2018-9,2019-3'),
        ('June, September 2018', '2018-6,2018-9'),
        ('January, April or September 2018', '2018-9,2019-1,2019-4'),
        ('January, April, September 2018', '2018-9,2019-1,2019-4'),
        ('January 2018; September 2018', '2018-9'),
        ('October 2018; May, January 2018', '2018-10,2019-1,2019-5'),
        # Shadowed by the same phrase without the trailing space above;
        # both map to the same value.
        ('September 2018; January 2018 ', '2018-9,2019-1'),
        ('January, May, September 2018, 2019', '2018-9,2019-1,2019-5'),
    )
    for phrase, normalized in known_start_dates:
        if phrase in start_date:
            start_date = normalized
            break
    else:
        # Unknown wording: let the project helper convert the month.
        start_date = '2018-' + str(translate_month(start_date))

    modules_en = remove_class(
        joined("//h3[contains(text(),'Modules')]//following-sibling::*[1]")).strip()

    rntry_requirements = remove_class(
        joined("//h3[contains(text(),'Selection criteria')]//following-sibling::*"))

    tuition_fee_pre = '£'
    other = 'https://www.cumbria.ac.uk/media/university-of-cumbria-website/content-assets/public/finance/documents/studentfinance/fees/postgraduate-taught-tuition-fees-2018-19.pdf'

    # Three programme families have a higher hard-coded IELTS threshold.
    stricter_subjects = ('Occupational Therapy', 'Physiotherapy', 'Social Work')
    if any(subject in programme_en for subject in stricter_subjects):
        ielts = 7.0
        ielts_r = ielts_s = ielts_l = ielts_w = 6.5
    else:
        ielts = 6.5
        ielts_r = ielts_s = ielts_l = ielts_w = 5.5

    require_chinese_en = '<p>Bachelor’s degree or equivalent.English Language: IELTS 6.5 with at least 6.0 in each section (or equivalent).</p>'
    apply_pre = '£'

    for key, value in (
        ('apply_pre', apply_pre),
        ('require_chinese_en', require_chinese_en),
        ('university', university),
        ('url', url),
        ('programme_en', programme_en),
        ('degree_type', degree_type),
        ('degree_name', degree_name),
        ('location', location),
        ('duration', duration),
        ('duration_per', duration_per),
        ('teach_time', teach_time),
        ('start_date', start_date),
        ('modules_en', modules_en),
        ('tuition_fee_pre', tuition_fee_pre),
        ('rntry_requirements', rntry_requirements),
        ('other', other),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_w', ielts_w),
        ('ielts_s', ielts_s),
        ('ielts_l', ielts_l),
    ):
        item[key] = value
    yield item
def parse(self, response):
    """Scrape one University of Leeds programme page into a single item.

    Most fields are read from the page. Degree type, TOEFL thresholds, the
    application link, currency prefixes and the supporting-documents blurb
    are hard-coded. Missing numeric data (duration, IELTS, tuition fee) is
    reported as ``None`` rather than raising.

    Args:
        response: page response exposing ``xpath(...).extract()`` and ``url``.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with all
        fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    def joined(xpath):
        # Serialized markup of every node matched by ``xpath``, concatenated.
        return ''.join(response.xpath(xpath).extract())

    university = 'University of Leeds'
    url = response.url
    degree_type = 2

    # The last word of the heading is taken as the degree name and removed
    # from the title; an empty heading no longer raises IndexError.
    programme_en = remove_tags(joined('//*[@id="main"]/div/header/h1'))
    title_words = programme_en.split()
    degree_name = title_words[-1] if title_words else ''
    if degree_name:
        programme_en = programme_en.replace(degree_name, '').strip()
    else:
        programme_en = programme_en.strip()

    overview_en = clear_space_str(remove_class(joined('//*[@id="acc1"]/p')))

    modules_en = clear_space_str(remove_class(
        joined("//*[contains(text(),'Modules')]//following-sibling::div")))

    assessment_en = remove_class(clear_space_str(
        joined("//*[contains(text(),'Assessment')]//following-sibling::*")))

    # Start date: the first keyword found wins; unknown wording maps to 2018-9.
    start_text = remove_tags(clear_space_str(
        joined('//*[@id="keyfacts-acc"]/ul/li[1]/span[2]')))
    start_date = '2018-9'
    for phrase, normalized in (
        ('September', '2018-9'),
        ('October', '2018-10'),
        ('January', '2019-1'),
        ('6 August 2018', '2018-8-6'),
        ('9 July 2018', '2018-7-9'),
    ):
        if phrase in start_text:
            start_date = normalized
            break

    duration_list = clear_space_str(remove_tags(
        joined("//*[contains(text(),'Duration/Mode')]//following-sibling::*")))
    numbers = re.findall(r'\d+', duration_list)
    # NOTE(review): unit code 4 is used for the week-based durations; 3 and 1
    # are presumably months and years - confirm the code meanings.
    if '6 weeks' in duration_list:
        duration, duration_per = 6, 4
    elif '10 weeks' in duration_list:
        duration, duration_per = 10, 4
    elif not numbers:
        # No number on the page: the old 'N/A' placeholder made int() raise
        # ValueError and aborted the whole item.
        duration, duration_per = None, None
    elif int(numbers[0]) > 3:
        duration, duration_per = numbers[0], 3
    else:
        duration, duration_per = numbers[0], 1

    teach_time = 'full time' if 'full time' in duration_list else 'part time'

    # IELTS: the count of d.d numbers found decides how they are assigned.
    scores = re.findall(
        r'\d\.\d',
        joined("//*[contains(text(),'Language requirements')]//following-sibling::*"))
    if len(scores) == 1:
        ielts = ielts_w = ielts_r = ielts_s = ielts_l = scores[0]
    elif len(scores) == 2:
        ielts = scores[0]
        ielts_w = ielts_r = ielts_s = ielts_l = scores[1]
    elif len(scores) == 3:
        ielts, ielts_w = scores[0], scores[1]
        ielts_r = ielts_s = ielts_l = scores[2]
    else:
        ielts = ielts_w = ielts_r = ielts_s = ielts_l = None

    department = clear_space_str(remove_tags(joined(
        "//*[contains(text(),'This course is taught by')]/../following-sibling::*")))

    rntry_requirements = remove_class(clear_space_str(joined(
        "//*[contains(text(),'Entry requirements:')]//following-sibling::*[1]")))
    # Drop the link caption; replace() is a no-op when it is absent.
    rntry_requirements = rntry_requirements.replace('Full entry requirements', '')

    fee_text = remove_tags(clear_space_str(joined(
        "//*[contains(text(),'International fees')]//following-sibling::*[1]")))
    fee_matches = re.findall(r'\d+,\d+', fee_text)
    tuition_fee = fee_matches[0].replace(',', '') if fee_matches else None
    tuition_fee_pre = '£'

    deadline_text = remove_tags(joined(
        "//*[contains(text(),'Application deadlines')]//following-sibling::*[1]|//*[contains(text(),'Application deadlines:')]/../following-sibling::*[1]"))
    deadline = 'N/a'
    for month, month_end in (
        ('August', '2018-8-31'),
        ('July', '2018-7-31'),
        ('June', '2018-6-30'),
        ('April', '2018-4-30'),
    ):
        if month in deadline_text:
            deadline = month_end
            break

    career_en = clear_space_str(remove_class(joined(
        "//h2[contains(text(),'Career opportunities')]/../following-sibling::*")))

    apply_proces_en = 'https://application.leeds.ac.uk/login/?returnurl=%2f'

    toefl = 92
    toefl_l = 21
    toefl_r = 21
    toefl_s = 23
    toefl_w = 22

    apply_pre = '£'
    apply_documents_en = '<p>Make sure you have all your supporting documents scanned and ready to upload with your online application. All documents should be in English, or sent with certified translations into English. Without copies of the required documents we will be unable to make you an offer.</p>'

    for key, value in (
        ('apply_pre', apply_pre),
        ('apply_documents_en', apply_documents_en),
        ('toefl', toefl),
        ('toefl_l', toefl_l),
        ('toefl_r', toefl_r),
        ('toefl_s', toefl_s),
        ('toefl_w', toefl_w),
        ('university', university),
        ('url', url),
        ('programme_en', programme_en),
        ('degree_type', degree_type),
        ('degree_name', degree_name),
        ('overview_en', overview_en),
        ('modules_en', modules_en),
        ('assessment_en', assessment_en),
        ('start_date', start_date),
        ('duration', duration),
        ('teach_time', teach_time),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_w', ielts_w),
        ('ielts_l', ielts_l),
        ('ielts_s', ielts_s),
        ('department', department),
        ('rntry_requirements', rntry_requirements),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('deadline', deadline),
        ('career_en', career_en),
        ('apply_proces_en', apply_proces_en),
        ('duration_per', duration_per),
    ):
        item[key] = value
    yield item
def parse(self, response):
    """Scrape one University of Essex programme page into a single item.

    Most fields are read from the page. The requirement text for Chinese
    applicants is fetched from a second endpoint whose URL is built from the
    two numbers found in the course URL. Degree type, the application link,
    the currency prefix and the supporting-documents blurb are hard-coded.

    Args:
        response: page response exposing ``xpath(...).extract()`` and ``url``.

    Yields:
        The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with all
        fields filled in.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    def joined(xpath):
        # Serialized markup of every node matched by ``xpath``, concatenated.
        return ''.join(response.xpath(xpath).extract())

    university = 'University of Essex'
    url = response.url
    degree_type = 2

    # The first word of the heading is taken as the degree name and removed
    # from the title; an empty heading no longer raises IndexError.
    programme_en = remove_tags(joined('//*[@id="content"]//h1'))
    title_words = programme_en.split()
    degree_name = title_words[0] if title_words else ''
    if degree_name:
        programme_en = programme_en.replace(degree_name, '').strip()
    else:
        programme_en = programme_en.strip()

    start_text = clear_space_str(remove_tags(
        joined("//*[contains(text(),'Start date')]//following-sibling::select")))
    start_date = '2018-10,2019-10' if 'Oct 2018/19' in start_text else '2018-9,2019-9'

    mode_text = remove_tags(
        joined("//*[contains(text(),'Study mode')]//following-sibling::select"))
    teach_time = 'Full Time' if 'Full Time' in mode_text else 'Part Time'

    duration_list = remove_tags(
        joined("//*[contains(text(),'Duration')]//following-sibling::*"))
    digits = re.findall(r'\d', duration_list)
    # NOTE(review): unit code 3 is presumably months and 1 years - confirm.
    if duration_list == '1 years 8 months':
        duration, duration_per = '20', 3
    elif not digits:
        # No digit on the page: previously an IndexError aborted the item.
        duration, duration_per = None, None
    elif int(digits[0]) < 5:
        duration, duration_per = digits[0], 1
    else:
        duration, duration_per = digits[0], 3

    location = remove_tags(
        joined("//*[contains(text(),'Location')]//following-sibling::span"))

    # An implausibly long match means the XPath caught unrelated content.
    department = remove_tags(
        joined("//*[contains(text(),'Based in')]//following-sibling::*"))
    if len(department) > 500:
        department = 'N/A'

    overview_en = clear_space_str(remove_class(joined('//*[@id="overview"]//p')))

    # IELTS: collect d.d numbers, drop one occurrence each of the degree
    # classifications 2.2 / 2.1, then assign by how many numbers remain.
    scores = re.findall(r'\d\.\d', joined('//*[@id="entry-requirements"]//text()'))
    for degree_class in ('2.2', '2.1'):
        if degree_class in scores:
            scores.remove(degree_class)
    if len(scores) in (2, 4):
        ielts = scores[0]
        ielts_s = ielts_w = ielts_l = ielts_r = scores[1]
    elif len(scores) == 3:
        ielts, ielts_w = scores[0], scores[1]
        ielts_r = ielts_l = ielts_s = scores[2]
    else:
        ielts = 6.0
        ielts_w = ielts_r = ielts_l = ielts_s = 5.5

    modules_en = clear_space_str(remove_class(
        joined("//div[@class='tabs__panels content-padding']")))

    tuition_fee = remove_tags(
        joined("//*[contains(text(),'International fee')]//following-sibling::*"))
    tuition_fee = tuition_fee.replace(',', '').replace('£', '')
    # 'TBC' or an over-long match carries no usable fee.
    if tuition_fee == 'TBC' or len(tuition_fee) >= 200:
        tuition_fee = None
    tuition_fee_pre = '£'

    apply_proces_en = 'https://www1.essex.ac.uk/pgapply/login.aspx'

    rntry_requirements = remove_class(joined('//*[@id="entry-requirements"]/div//p[1]'))

    # Requirement text for Chinese applicants comes from a second endpoint
    # keyed by the two numbers in the course URL. URLs that do not carry both
    # numbers now yield '' instead of raising IndexError.
    course_path = re.findall(r'courses/pg(.*)/', url)
    course_ids = re.findall(r'\d+', course_path[0]) if course_path else []
    if len(course_ids) >= 2:
        chi_url = (
            'https://www.essex.ac.uk/api/sitecore/coursePage/EntryRequirementInternational?mastercourseid=PG'
            + str(course_ids[0]) + '&subgroupcode=' + str(course_ids[1])
            + '&courseyear=18&countrykey=631'
        )
        data = requests.get(chi_url)
        paragraphs = etree.HTML(data.text).xpath('/html/body/div/p/text()')
        require_chinese_en = '<p>' + ''.join(paragraphs) + '</p>'
    else:
        require_chinese_en = ''

    # Static blurb; adjacent literals concatenate into one string. The
    # closing tag used to be the malformed "<\p>".
    apply_documents_en = (
        "<p>Necessary documents When you apply to study with us, you'll need to provide a number of supporting documents - we can't process your application until we have these. "
        "Some of these documents you will have to upload with your application, others you may be able to upload at a later date. "
        "We may ask to see original documents if you are offered a place. "
        "English language If you have received your test results you may include a copy with your application. "
        "The main tests we accept are IELTS, TOEFL or Pearson, and the test must be less than two years old at the time of admission. "
        "The IELTS requirement for your course is listed on our Postgraduate Research Finder. "
        "You can also see more detailed information about English language requirements here (.pdf) "
        "Transcripts Official transcript(s), in English or a certified translation of your academic results to date, showing marks or grades, must be provided at the time you make your application. "
        "(Transcripts are not required from current or previous University of Essex students, or from students who have previously completed a degree at Colchester Institute awarded by the University of Essex). "
        "CV A CV is required for some research degrees at the time of application. "
        "Research proposal Requirements vary across departments but two references and a research proposal are required for all research degrees. "
        "A research proposal is required at application stage for most research degrees. "
        "Think about your research idea - during your PhD you will conduct and present the results of your original investigations and research. "
        "You need to ensure that your research topic will be interesting enough for three or four years. "
        "Start to research your topic by reading around your subject area and begin to think what you might like to include in your research proposal. "
        "Get in touch with a suitable department by contacting the Graduate Director - you might still be developing your idea at this stage, but it would be great if you could send a short description of your research area and a copy of your CV. "
        "This does not need to be longer than one A4 page. "
        "You can search for a department or supervisor through our Postgraduate Research Finder. "
        "Writing your proposalYour research proposal is an important part of your application for a research degree. "
        "Use it to explain your personal and academic goals in undertaking an extended period of research, and reflect on the contribution you will make to the development of new knowledge, ideas and solutions. "
        "Also comment on how your research interests fit with the academic focus and expertise at Essex Your research proposal needs to demonstrate that you have, or are able to develop, the competencies and skills needed to complete your project, within the time and resources available. "
        "The quality of your writing is important and a good research proposal may be rejected if it is poorly expressed or badly presented. "
        "Many of our departments, schools and centres offer more detailed guidance on preparing a research proposal on their web pages. "
        "If you are applying for funding, ensure your proposal fulfils the requirements of your preferred funding body. "
        "Your research proposal should include: a working title and key words a summary of the aims and objectives of your research an outline of the ways you meet these aims and objectives, referring to research methods and specific resources you use evidence of your awareness of relevant literature and theoretical approaches an overview of the expected outcomes and the original contribution your research will make to existing bodies of knowledge a brief statement on how your research interests tie in with those found in the department, school or centrePersonal statement If you are applying for a taught course and you need a Tier 4 student visa to study in the UK, then a personal statement (no more than 500 words) is required at the time you make your application, and this should refer specifically to your reasons for wishing to study in the UK, and why you have chosen your area of study. "
        "Please remember to include details of any relevant work experience, why you think your academic strengths are suited to your area of study, and how this study will assist you to realise your career objectives. "
        "References We require two references from you at the application stage.References should be recent and verifiable, on official institution paper, signed and dated by the referee. "
        "If a referee wishes to provide an email reference, it must be sent from an official email account (for example, not Yahoo, Gmail or Hotmail).</p>"
    )

    career_en = remove_class(
        joined("//*[contains(text(),'Your future')]//following-sibling::*"))

    # Teaching, assessment and dissertation sections are concatenated and
    # capped at 30000 characters.
    assessment_en = ''.join(
        remove_class(joined("//*[contains(text(),'" + heading + "')]//following-sibling::*"))
        for heading in ('Teaching', 'Assessment', 'Dissertation')
    )
    if len(assessment_en) > 30000:
        assessment_en = assessment_en[:30000]

    for key, value in (
        ('apply_documents_en', apply_documents_en),
        ('require_chinese_en', require_chinese_en),
        ('rntry_requirements', rntry_requirements),
        ('university', university),
        ('url', url),
        ('programme_en', programme_en),
        ('degree_type', degree_type),
        ('degree_name', degree_name),
        ('start_date', start_date),
        ('teach_time', teach_time),
        ('duration', duration),
        ('duration_per', duration_per),
        ('location', location),
        ('department', department),
        ('overview_en', overview_en),
        ('ielts', ielts),
        ('ielts_w', ielts_w),
        ('ielts_r', ielts_r),
        ('ielts_s', ielts_s),
        ('ielts_l', ielts_l),
        ('modules_en', modules_en),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('apply_proces_en', apply_proces_en),
        ('career_en', career_en),
        ('assessment_en', assessment_en),
    ):
        item[key] = value
    yield item
def parse(self, response):
    """Build one item from a University of Greenwich course page.

    Most fields are read from ``response`` with XPath; tuition fee, IELTS
    scores and the application URL are fixed values chosen in this method.

    Yields:
        The populated item (exactly one per response).
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    # 1.university / 2.url
    university = 'University of Greenwich'
    url = response.url

    # 3.programme_en / 5.degree_name
    # The heading is split at its LAST comma: text before it is the programme
    # (inner commas dropped), text after it is the degree. The previous code
    # stripped every comma before searching for one, so degree_name was always
    # 'N/A', and a heading without a comma produced an empty programme name.
    title = ''.join(
        response.xpath('//*[@id="default"]/header/div/h1').extract())
    title = clear_space_str(remove_tags(title))
    head, comma, tail = title.rpartition(',')
    if comma:
        programme_en = head.replace(',', '').strip()
        degree_name = tail.strip() or 'N/A'
    else:
        programme_en = title.strip()
        degree_name = 'N/A'

    # 4.degree_type
    degree_type = 1

    # 6.department
    department = ''.join(
        response.xpath(
            '//i[@aria-label="Department"]//following-sibling::*').extract())
    department = remove_tags(department).replace('& ', '')

    # 7.location
    location = ''.join(
        response.xpath(
            '//i[@aria-label="Location"]//following-sibling::*').extract())
    location = remove_tags(location)

    # 8.ucascode: only the first four characters of the heading are kept.
    ucascode = ''.join(
        response.xpath(
            '//*[@id="faculty"]/div[2]/article/div/div/div[1]/div[2]/div[2]/div/h3'
        ).extract())
    ucascode = remove_tags(ucascode)[:4]

    # 9.duration / 10.duration_per
    duration = ''.join(
        response.xpath(
            '//*[@aria-label="Duration"]//following-sibling::p[1]').extract())
    duration = remove_tags(duration).replace('full time', '').strip()
    duration_per = 1

    # 11.overview_en
    overview_en = remove_class(''.join(
        response.xpath("//div[contains(@class,'overview-text')]").extract()))

    # 12.modules_en
    # NOTE(review): the module HTML is fetched and cleaned here, but the
    # original never stored it on the item (the assignment was commented
    # out). That is kept as-is; confirm whether item['modules_en'] = doc
    # should be enabled or this blocking request removed.
    prog_no = ''.join(
        response.xpath("//meta[@name='prog_no']//@content").extract())
    doc = ''
    if prog_no:
        modules_en_url = 'https://www.gre.ac.uk/ug/content/ajax/courses-ajax-call?prog=' + str(
            prog_no)
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        data = requests.get(modules_en_url, headers=headers)
        response1 = etree.HTML(data.text)
        for node in response1.xpath("//div[@class='gre-page-copy']"):
            doc += etree.tostring(
                node, encoding='unicode', pretty_print=False, method='html')
        doc = remove_class(doc)

    # 13.apply_desc_en
    apply_desc_en = remove_class(''.join(
        response.xpath('//*[@id="entry-requirements"]/div').extract())).strip()

    # 14.assessment_en (best effort: any extraction failure gives '')
    try:
        assessment_en = remove_class(''.join(
            response.xpath(
                "//h3[contains(text(),'Careers')]//preceding-sibling::*"
            ).extract()))
    except Exception:
        assessment_en = ''

    # 15.career_en: try the primary heading first, then the fallback heading.
    career_parts = response.xpath(
        "//h4[contains(text(),'Do you provide employability services?')]//preceding-sibling::*"
    ).extract()
    if not career_parts:
        career_parts = response.xpath(
            "//h4[contains(text(),'areers')]/following-sibling::*").extract()
    career_en = remove_class(''.join(career_parts))

    # 16.tuition_fee: programmes whose name contains one of these pay 13950.
    higher_fee_programmes = (
        'Adult Nursing',
        'Business Logistics and Transport Management',
        'Business Purchasing and Supply Chain Management',
        'Business Studies',
        'Business with Accounting',
        'Business with Finance',
        'Business with Human Resource Management',
        'Business with Marketing',
        "Children's Nursing",
        'Law',
        'Nursing',
        'Mental Health Work',
        'Midwifery',
        'Paramedic Science',
        'Specialist Community Public Health',
        'Study Abroad',
    )
    if any(name in programme_en for name in higher_fee_programmes):
        tuition_fee = 13950
    else:
        tuition_fee = 12100

    # 17.tuition_fee_pre / 18.apply_proces_en
    tuition_fee_pre = '£'
    apply_proces_en = 'https://www.gre.ac.uk/study/apply/ug'

    # 19-23.ielts
    ielts = 6.5
    ielts_r = 5.5
    ielts_w = 5.5
    ielts_s = 5.5
    ielts_l = 5.5

    # 24.apply_pre
    apply_pre = '£'

    # 25.alevel
    alevel_parts = response.xpath(
        "//*[contains(text(),'UCAS points')]//following-sibling::*[1]"
    ).extract()
    if not alevel_parts:
        alevel_parts = response.xpath(
            "//*[contains(text(),'points')]//text()").extract()
    points = re.findall(r'(\d+)\W\(view', remove_tags(''.join(alevel_parts)))
    if not points:
        fallback_text = ''.join(
            response.xpath("//*[contains(text(),'points')]//text()").extract())
        points = re.findall(r'(\d+)\WUCAS', fallback_text)
    # Previously a page without any points figure ended in None + str and
    # raised TypeError, losing the whole item; now alevel is simply None.
    alevel = points[0] + ' UCAS points' if points else None

    fields = (
        ('alevel', alevel),
        ('apply_pre', apply_pre),
        ('university', university),
        ('url', url),
        ('programme_en', programme_en),
        ('degree_type', degree_type),
        ('degree_name', degree_name),
        ('department', department),
        ('location', location),
        ('ucascode', ucascode),
        ('duration', duration),
        ('duration_per', duration_per),
        ('overview_en', overview_en),
        ('apply_desc_en', apply_desc_en),
        ('assessment_en', assessment_en),
        ('career_en', career_en),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('apply_proces_en', apply_proces_en),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_w', ielts_w),
        ('ielts_s', ielts_s),
        ('ielts_l', ielts_l),
    )
    for key, value in fields:
        item[key] = value
    yield item
def parse(self, response):
    """Build one item from a Cardiff University course page.

    Text fields come from XPath queries on ``response``; the start date,
    currency prefix and IELTS scores are fixed values chosen here.

    Yields:
        The populated item (exactly one per response).
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    # 1.university / 2.url
    university = 'Cardiff University'
    url = response.url

    # 3.programme_en / 8.degree_name
    # Both come from the same heading, so it is extracted once. The last
    # whitespace-separated word is the degree; it is removed from the
    # programme name only when the heading contains '('.
    title = remove_tags(''.join(
        response.xpath('//*[@id="content"]/div[1]/div/div[1]/h1').extract()))
    words = title.split()
    if '(' in title:
        programme_en = ' '.join(words[:-1])
    else:
        programme_en = title
    # An empty heading used to raise IndexError on split()[-1].
    if words:
        degree_name = words[-1].replace('(', '').replace(')', '').strip()
    else:
        degree_name = ''

    # 4.overview_en
    overview_en = ''.join(
        response.xpath('//*[@id="content"]/div[1]/div/div[1]/p').extract())
    overview_en = clear_space_str(remove_class(overview_en))

    # 5.start_date
    start_date = '2019-9'

    # 6.duration / 7.duration_per
    duration_text = remove_tags(''.join(
        response.xpath(
            "//*[contains(text(),'Duration')]//following-sibling::*"
        ).extract()))
    if 'five years' in duration_text:
        duration = 5
    elif 'seven years' in duration_text:
        duration = 7
    else:
        # First number in the text (kept as a string), or 1 when absent.
        numbers = re.findall(r'\d+', duration_text)
        duration = numbers[0] if numbers else 1
    duration_per = 3 if 'months' in duration_text else 1

    # 9.ucascode
    ucascode = remove_tags(''.join(
        response.xpath('//*[@id="section1"]/table[1]//tr[1]/td').extract()))

    # 10.modules_en
    modules_en = remove_class(''.join(
        response.xpath('//*[@id="section2"]').extract()))

    # 11.alevel
    alevel = remove_tags(''.join(
        response.xpath(
            "//*[contains(text(),'A level ')]//following-sibling::*"
        ).extract())).strip()

    # 12.assessment_en
    assessment_en = remove_class(''.join(
        response.xpath(
            "//*[contains(text(),'How will I be assessed?')]//following-sibling::p"
        ).extract()))

    # 13.career_en
    career_en = remove_class(''.join(
        response.xpath('//*[@id="section5"]/div[1]/div[1]/p').extract()))

    # 14.tuition_fee
    tuition_fee = remove_tags(''.join(
        response.xpath(
            '//*[@id="tuitionfees"]/table/tbody/tr/td[1]').extract()))
    tuition_fee = getTuition_fee(tuition_fee)

    # 15.ib
    ib = remove_tags(''.join(
        response.xpath(
            "//*[contains(text(),'International Baccalaureate')]//following-sibling::*"
        ).extract())).strip()

    # 16.tuition_fee_pre
    tuition_fee_pre = '£'

    # 17-21.ielts: first matching keyword wins.
    # Tuple order is (ielts, ielts_s, ielts_r, ielts_l, ielts_w).
    ielts_rules = (
        ('Dentistry', (7.0, 6.5, 6.5, 6.5, 6.5)),
        ('Medicine', (7.0, 6.5, 6.5, 6.5, 6.5)),
        ('Law', (6.5, 6.5, 6.0, 6.0, 6.0)),
        ('Politics', (6.5, 6.5, 6.0, 6.0, 6.0)),
    )
    ielts, ielts_s, ielts_r, ielts_l, ielts_w = 6.5, 5.5, 5.5, 5.5, 5.5
    for keyword, scores in ielts_rules:
        if keyword in programme_en:
            ielts, ielts_s, ielts_r, ielts_l, ielts_w = scores
            break

    # 22.degree_type
    degree_type = 1

    fields = (
        ('ib', ib),
        ('alevel', alevel),
        ('ucascode', ucascode),
        ('degree_name', degree_name),
        ('university', university),
        ('url', url),
        ('programme_en', programme_en),
        ('overview_en', overview_en),
        ('start_date', start_date),
        ('duration', duration),
        ('duration_per', duration_per),
        ('degree_type', degree_type),
        ('modules_en', modules_en),
        ('assessment_en', assessment_en),
        ('career_en', career_en),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('ielts', ielts),
        ('ielts_s', ielts_s),
        ('ielts_l', ielts_l),
        ('ielts_r', ielts_r),
        ('ielts_w', ielts_w),
    )
    for key, value in fields:
        item[key] = value
    yield item
def parse(self, response):
    """Build one item from a London Metropolitan University course page.

    Text fields come from XPath queries on ``response``; teaching mode,
    location, currency prefixes, IELTS scores and the Chinese entry
    requirement text are fixed values chosen here.

    Yields:
        The populated item (exactly one per response).
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    # 1.university / 2.url
    university = 'London Metropolitan University'
    url = response.url

    # 3.programme_en / 5.degree_name
    # The heading ends with "- <degree>"; that suffix becomes degree_name
    # and is removed from the programme name.
    title = remove_tags(''.join(
        response.xpath('/html/body/div[1]/div/h1/span').extract())).strip()
    suffix_matches = re.findall(r'-\s[A-Za-z\s/]+$', title)
    degree_suffix = suffix_matches[0] if suffix_matches else ''
    programme_en = title.replace(degree_suffix, '').strip()
    degree_name = degree_suffix.replace('-', '').strip()

    # 4.degree_type
    degree_type = 2

    # 6.overview_en
    overview_en = remove_class(''.join(
        response.xpath('//*[@id="why-study-this-course"]/p').extract()))

    # 7.assessment_en
    assessment_en = clear_space_str(''.join(
        response.xpath(
            "//*[contains(text(),'Assessment')]//following-sibling::*"
        ).extract()))

    # 8.modules_en
    modules_en = remove_class(''.join(
        response.xpath('//*[@id="modular-structure"]').extract())).replace(
            '▼', '')

    # 9.career_en
    career_en = remove_class(''.join(
        response.xpath('//*[@id="after-the-course"]').extract()))

    # 10-14.ielts
    # Rules are matched against the FULL heading. Several rules include the
    # "- MSc"/"- MA" suffix (and 'LLM' is itself a degree suffix), so testing
    # them against programme_en, which has the suffix stripped, could never
    # succeed and those courses silently received the default scores.
    upper_band = (
        'Education',
        'Creative, Digital and Professional Writing',
        'Interpreting',
        'LLM',
        'Psychology',
        'Teaching Languages (English) - MA',
    )
    top_band = (
        'Biomedical Science - MSc',
        'Blood Science (Distance Learning) - MSc',
        'Common Professional Exam',
        'Legal Practice Course',
    )
    if any(name in title for name in upper_band):
        ielts, component = 6.5, 6.0
    elif any(name in title for name in top_band):
        ielts, component = 7.0, 6.5
    else:
        ielts, component = 6.0, 5.5
    ielts_w = ielts_r = ielts_s = ielts_l = component

    # 15.tuition_fee
    tuition_fee = getTuition_fee(''.join(
        response.xpath(
            "//optgroup[@label='International']/option[1]/@data-cost"
        ).extract()))

    # 16.duration / 17.duration_per
    duration_text = ''.join(
        response.xpath(
            "//optgroup[@label='International']/option[1]/@data-duration"
        ).extract())
    duration = None
    duration_per = None
    if duration_text:
        # A non-empty attribute without digits used to raise IndexError.
        numbers = re.findall(r'\d+', duration_text)
        duration = numbers[0] if numbers else None
        if 'months' in duration_text:
            duration_per = 3
        elif 'year' in duration_text:
            duration_per = 1
        elif 'weeks' in duration_text:
            duration_per = 4

    # 18.tuition_fee_pre / 19.teach_time / 20.location / 21.apply_pre
    tuition_fee_pre = '£'
    teach_time = 'Full time'
    location = 'London'
    apply_pre = '£'

    # 22.rntry_requirements
    rntry_requirements = remove_class(''.join(
        response.xpath('//*[@id="entry-requirements"]').extract()))

    # 23.require_chinese_en
    require_chinese_en = "<p>A completed bachelor's degree from a high ranking Chinese institution Grade: 70% or above</p>"

    fields = (
        ('require_chinese_en', require_chinese_en),
        ('rntry_requirements', rntry_requirements),
        ('apply_pre', apply_pre),
        ('university', university),
        ('url', url),
        ('programme_en', programme_en),
        ('degree_type', degree_type),
        ('degree_name', degree_name),
        ('overview_en', overview_en),
        ('assessment_en', assessment_en),
        ('modules_en', modules_en),
        ('career_en', career_en),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_w', ielts_w),
        ('ielts_l', ielts_l),
        ('ielts_s', ielts_s),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('duration', duration),
        ('duration_per', duration_per),
        ('teach_time', teach_time),
        ('location', location),
    )
    for key, value in fields:
        item[key] = value
    yield item
def parse(self, response):
    """Build item(s) from a University of Leeds course page.

    Text fields come from XPath queries on ``response``; start date, TOEFL
    scores, currency prefixes, the application URL and the supporting
    documents text are fixed values chosen here.

    Yields:
        One item per "degree: code" entry when the page lists more than one
        UCAS code; otherwise a single item carrying the page-level UCAS code
        and the degree taken from the heading.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    # 1.university / 2.url
    university = 'University of Leeds'
    url = response.url

    # 3.programme_en / 5.degree_name
    # The last whitespace-separated word of the heading is the degree.
    title = remove_tags(''.join(
        response.xpath('//*[@id="main"]/div/header/h1').extract()))
    words = title.split()
    # An empty heading used to raise IndexError here.
    degree_name = words[-1] if words else ''
    if degree_name:
        # Cut only the trailing degree token; str.replace() would also delete
        # the same text wherever else it occurs in the programme name.
        programme_en = title.rstrip()[:-len(degree_name)]
    else:
        programme_en = title
    programme_en = programme_en.strip().replace(',', '')

    # 4.degree_type
    degree_type = 1

    # 6.overview_en
    overview_en = ''.join(response.xpath('//*[@id="acc1"]/p').extract())
    overview_en = clear_space_str(remove_class(overview_en))

    # 7.modules_en
    modules_en = remove_class(''.join(
        response.xpath(
            "//*[contains(text(),'Modules')]//following-sibling::*[position()<7]"
        ).extract()))

    # 8.assessment_en
    assessment_en = ''.join(
        response.xpath(
            "//*[contains(text(),'Assessment')]//following-sibling::*"
        ).extract())
    assessment_en = remove_class(clear_space_str(assessment_en))

    # 9.start_date
    start_date = '2019-9'

    # 10.duration / duration_per
    duration_text = ''.join(
        response.xpath(
            "//*[contains(text(),'Duration/Mode')]//following-sibling::*"
        ).extract())
    duration_text = clear_space_str(remove_tags(duration_text))
    numbers = re.findall(r'\d+', duration_text)
    duration = numbers[0] if numbers else 3
    duration_per = 3 if int(duration) > 10 else 1

    # 11.ucascode (page-level value, used when there are no variants)
    ucascode = ''.join(
        response.xpath(
            "//*[contains(text(),'UCAS code:')]//following-sibling::*"
        ).extract())
    ucascode = clear_space_str(remove_tags(ucascode))

    # 12-16.ielts: first score found is the overall one, the second (if any)
    # is used for every component.
    acc3_html = ''.join(response.xpath('//*[@id="acc3"]').extract())
    scores = re.findall(r'[567]\.\d', acc3_html)
    if len(scores) > 1:
        ielts, component = scores[0], scores[1]
    elif len(scores) == 1:
        ielts = component = scores[0]
    else:
        ielts = component = None
    ielts_r = ielts_l = ielts_w = ielts_s = component

    # 17.department
    department = ''.join(
        response.xpath(
            "//*[contains(text(),'This course is taught by')]/../following-sibling::*"
        ).extract())
    department = clear_space_str(remove_tags(department))

    # 18.alevel
    alevel = remove_tags(''.join(
        response.xpath('//*[@id="acc3"]/p[1]').extract())).replace(
            'A-level:', '').strip()

    # 19.tuition_fee (same #acc3 markup as the IELTS scores)
    tuition_fee = getTuition_fee(remove_tags(clear_space_str(acc3_html)))

    # 20.tuition_fee_pre
    tuition_fee_pre = '£'

    # 21.ib: last following sibling, or 'N/A' when there is none.
    ib_parts = response.xpath(
        "//*[contains(text(),'International Baccalaureate')]//following-sibling::*"
    ).extract()
    ib = remove_tags(ib_parts[-1]) if ib_parts else 'N/A'

    # 22.career_en
    career_en = ''.join(
        response.xpath(
            "//h2[contains(text(),'Career opportunities')]/../following-sibling::*"
        ).extract())
    career_en = clear_space_str(remove_class(career_en))

    # 23.apply_proces_en
    apply_proces_en = 'https://application.leeds.ac.uk/login/?returnurl=%2f'

    # 24-28.toefl
    toefl = 87
    toefl_l = 20
    toefl_r = 20
    toefl_s = 22
    toefl_w = 21

    # 29.apply_pre
    apply_pre = '£'

    # 30.apply_documents_en
    apply_documents_en = '<p>Make sure you have all your supporting documents scanned and ready to upload with your online application. All documents should be in English, or sent with certified translations into English. Without copies of the required documents we will be unable to make you an offer.</p>'

    fields = (
        ('ib', ib),
        ('alevel', alevel),
        ('apply_pre', apply_pre),
        ('apply_documents_en', apply_documents_en),
        ('toefl', toefl),
        ('toefl_l', toefl_l),
        ('toefl_r', toefl_r),
        ('toefl_s', toefl_s),
        ('toefl_w', toefl_w),
        ('university', university),
        ('url', url),
        ('programme_en', programme_en),
        ('degree_type', degree_type),
        ('degree_name', degree_name),
        ('overview_en', overview_en),
        ('modules_en', modules_en),
        ('assessment_en', assessment_en),
        ('start_date', start_date),
        ('duration', duration),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_w', ielts_w),
        ('ielts_l', ielts_l),
        ('ielts_s', ielts_s),
        ('department', department),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('career_en', career_en),
        ('apply_proces_en', apply_proces_en),
        ('duration_per', duration_per),
    )
    for key, value in fields:
        item[key] = value

    # Pages listing several "degree: code" entries produce one item each.
    variants = response.xpath(
        "//*[contains(text(),'UCAS code:')]//following-sibling::span//text()"
    ).extract()
    emitted = False
    if len(variants) > 1:
        for raw in variants:
            text = raw.strip()
            codes = re.findall(r':(.*)', text)
            names = re.findall(r'(.*):', text)
            if not codes or not names:
                # Text node without a colon: previously an IndexError that
                # aborted the remaining variants.
                continue
            # Yield a copy so later iterations do not overwrite an item that
            # has already been handed on.
            # NOTE(review): assumes the object from get_item1 offers .copy()
            # (dict and scrapy.Item both do) - confirm.
            variant = item.copy()
            variant['ucascode'] = codes[0].strip()
            variant['degree_name'] = names[0].strip()
            emitted = True
            yield variant
    if not emitted:
        item['ucascode'] = ucascode
        item['degree_name'] = degree_name
        yield item
def parse(self, response):
    """Build one item from a University of Western Australia course page.

    This variant sets ``degree_type`` to 2 and strips a 'Master of' prefix
    from the programme name. Text fields come from XPath queries on
    ``response``; IELTS/TOEFL scores are looked up from keyword tables and
    the application text, fee and currency prefixes are fixed values.

    Yields:
        The populated item (exactly one per response).
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    # 1.university / 2.url
    university = 'The University of Western Australia'
    url = response.url

    # 3.programme_en / 26.degree_name
    # The unmodified heading is the degree name; the programme name is the
    # heading with 'Master of' removed.
    degree_name = remove_tags(''.join(
        response.xpath(
            '//*[@id="page-content"]/div[1]/div[3]/div/div/div[3]/h1'
        ).extract()))
    programme_en = degree_name
    if 'Master of' in programme_en:
        programme_en = programme_en.replace('Master of', '').strip()

    # 4.overview_en
    overview_en = remove_class(''.join(
        response.xpath(
            '//*[@id="course-details"]/div/div/div/section/div[1]/div[1]/div[1]/div/div'
        ).extract()))

    # 5.modules_en
    modules_en = remove_class(''.join(
        response.xpath(
            "//h2[contains(text(),'Course structure details')]//following-sibling::*"
        ).extract()))

    # 6.teach_time
    teach_time = 'coursework'

    # 7.location: first following sibling only. A page without the section
    # used to raise IndexError; it now yields an empty location.
    location_parts = response.xpath(
        "//*[contains(text(),'Locations')]//following-sibling::*").extract()
    if location_parts:
        location = clear_space_str(remove_tags(location_parts[0]).strip())
    else:
        location = ''

    # 8.start_date
    start_text = remove_tags(''.join(
        response.xpath(
            "//*[contains(text(),'Starting dates')]//following-sibling::*"
        ).extract())).strip()
    start_date = '2019-1' if 'January' in start_text else 'Semester1,Semester2'

    # 9.career_en
    career_en = remove_class(''.join(
        response.xpath(
            '//*[@id="careers-and-further-study"]/div/div/div/section/div[2]/div/div/div/a'
        ).extract())).strip()

    # 10.tuition_fee: retry with the capitalised label when the first
    # lookup parses to 0.
    tuition_fee = getTuition_fee(remove_tags(''.join(
        response.xpath(
            "//*[contains(text(),'fee')]//following-sibling::div").extract())))
    if tuition_fee == 0:
        tuition_fee = getTuition_fee(''.join(
            response.xpath(
                "//*[contains(text(),'Fee')]//following-sibling::div"
            ).extract()))

    # 11.tuition_fee_pre
    tuition_fee_pre = '$'

    # 12.rntry_requirements_en
    rntry_requirements_en = remove_class(''.join(
        response.xpath(
            "//*[contains(text(),'Admission Requirements')]//following-sibling::div"
        ).extract()))

    # 13-17.ielts
    # Ordered rules, first match wins. Each rule is
    # (text, exact, (ielts, ielts_r, ielts_w, ielts_l, ielts_s)); ``exact``
    # means the programme name must equal the text, otherwise contain it.
    band_a = (7.0, 6.5, 6.5, 6.5, 6.5)
    band_b = (7.0, 7.0, 7.0, 7.0, 7.0)
    ielts_rules = (
        ('MBA', False, band_a),
        ('Health', False, band_a),
        ('Educational Leadership', False, band_a),
        ('Forensic Odontology', False, band_a),
        ('Dental Medicine', False, band_b),
        ('Clinical Dentistry', False, band_b),
        ('Medicine', True, band_b),
        ('Podiatric Medicine', False, band_b),
        ('Clinical Neuropsychology', False, band_b),
        ('Clinical Psychology', False, band_b),
        ('Clinical Audiology', False, band_b),
        ('Industrial and Organisational Psychology', True, band_b),
        ('Pharmacy', True, band_b),
        ('Social Work', True, band_b),
        ('Education', True, (7.5, 6.5, 6.5, 6.5, 6.5)),
        ('Teaching', True, (7.5, 7.0, 7.0, 8.0, 8.0)),
        ('Law', False, band_a),
        ('Juris Doctor', False, (7.5, 7.0, 7.0, 7.0, 7.0)),
    )
    ielts, ielts_r, ielts_w, ielts_l, ielts_s = 6.5, 6.0, 6.0, 6.0, 6.0
    for text, exact, scores in ielts_rules:
        hit = programme_en == text if exact else text in programme_en
        if hit:
            ielts, ielts_r, ielts_w, ielts_l, ielts_s = scores
            break

    # 18-22.toefl
    # Ordered rules, first match wins; tuple order is
    # (toefl, toefl_s, toefl_l, toefl_r, toefl_w).
    clinical = (94, 23, 24, 24, 27)
    toefl_rules = (
        ('Law', (100, 28, 26, 26, 26)),
        ('Juris Doctor', (106, 28, 26, 26, 26)),
        ('MBA', (100, 20, 20, 20, 20)),
        ('Clinical Neuropsychology', clinical),
        ('Clinical Psychology', clinical),
        ('Clinical Audiology', clinical),
        ('Industrial and Organisationa', clinical),
    )
    toefl, toefl_s, toefl_l, toefl_r, toefl_w = 82, 20, 20, 18, 22
    for text, scores in toefl_rules:
        if text in programme_en:
            toefl, toefl_s, toefl_l, toefl_r, toefl_w = scores
            break

    # 23.apply_proces_en
    apply_proces_en = 'Check your chosen course is open to applications. Ensure you meet the admission requirements for this course as detailed on the previous tab. Ensure you meet our English language competency requirement and any course/major prerequisites. Apply'

    # 24.apply_pre / 25.apply_fee
    apply_pre = '$'
    apply_fee = 100

    # 27.degree_type
    degree_type = 2

    # 28.duration: known phrasings map to fixed values; otherwise the first
    # number in the markup is used (None when there is none, which used to
    # raise IndexError).
    duration_html = ''.join(
        response.xpath(
            "//*[contains(text(),'duration')]//following-sibling::*[1]//ul//li"
        ).extract())
    duration_markers = (
        ('<li>1.5', 1.5),
        ('<li>1 to 2', '1/2'),
        ('<li>0.5-1.5', '0.5/1.5'),
        ('<li>2-3', '2/3'),
        ('<li>One', 1),
        ('<li>Two', 2),
    )
    for marker, value in duration_markers:
        if marker in duration_html:
            duration = value
            break
    else:
        numbers = re.findall(r'\d+', duration_html)
        duration = numbers[0] if numbers else None

    fields = (
        ('duration', duration),
        ('degree_type', degree_type),
        ('degree_name', degree_name),
        ('university', university),
        ('url', url),
        ('programme_en', programme_en),
        ('overview_en', overview_en),
        ('modules_en', modules_en),
        ('teach_time', teach_time),
        ('location', location),
        ('start_date', start_date),
        ('career_en', career_en),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('rntry_requirements_en', rntry_requirements_en),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_w', ielts_w),
        ('ielts_s', ielts_s),
        ('ielts_l', ielts_l),
        ('toefl', toefl),
        ('toefl_r', toefl_r),
        ('toefl_s', toefl_s),
        ('toefl_l', toefl_l),
        ('toefl_w', toefl_w),
        ('apply_proces_en', apply_proces_en),
        ('apply_pre', apply_pre),
        ('apply_fee', apply_fee),
    )
    for key, value in fields:
        item[key] = value
    yield item
def parse(self, response):
    """Build item(s) from a University of Western Australia course page.

    This variant sets ``degree_type`` to 1. Text fields come from XPath
    queries on ``response``; IELTS/TOEFL scores depend only on whether the
    programme name contains 'Law', and the application text, fee and
    currency prefixes are fixed values.

    Yields:
        One item per degree listed under "Degrees course is available in",
        or a single item with an empty ``degree_name`` when none is listed.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    # 1.university / 2.url
    university = 'The University of Western Australia'
    url = response.url

    # 3.programme_en
    programme_en = remove_tags(''.join(
        response.xpath(
            '//*[@id="page-content"]/div[1]/div[3]/div/div/div[3]/h1'
        ).extract()))

    # 4.overview_en
    overview_en = remove_class(''.join(
        response.xpath(
            '//*[@id="course-details"]/div/div/div/section/div[1]/div[1]/div[1]/div/div'
        ).extract()))

    # 5.modules_en
    modules_en = ''.join(
        response.xpath(
            "//h2[contains(text(),'Course structure details')]//following-sibling::*"
        ).extract())
    modules_en = clear_space_str(remove_class(modules_en))

    # 6.apply_pre
    apply_pre = '$'

    # 8.start_date
    start_text = remove_tags(''.join(
        response.xpath(
            "//*[contains(text(),'Starting dates')]//following-sibling::*"
        ).extract())).strip()
    start_date = '2019-1' if 'January' in start_text else 'Semester1,Semester2'

    # 9.career_en
    career_en = remove_class(''.join(
        response.xpath(
            '//*[@id="careers-and-further-study"]/div/div/div/section/div[2]/div/div/div/a'
        ).extract())).strip()

    # 10.tuition_fee
    tuition_fee = getTuition_fee(remove_tags(''.join(
        response.xpath(
            "//*[contains(text(),'Fee')]//following-sibling::div").extract())))

    # 11.tuition_fee_pre
    tuition_fee_pre = '$'

    # 12.rntry_requirements_en
    rntry_requirements_en = remove_class(''.join(
        response.xpath(
            "//h3[contains(text(),'Admission requirements')]//following-sibling::div[1]"
        ).extract()))

    # 13-22.ielts / toefl: 'Law' programmes use the higher scores.
    if 'Law' in programme_en:
        ielts, ielts_r, ielts_w, ielts_l, ielts_s = 7.0, 6.5, 6.5, 6.5, 6.5
        toefl, toefl_s, toefl_l, toefl_r, toefl_w = 100, 28, 26, 26, 26
    else:
        ielts, ielts_r, ielts_w, ielts_l, ielts_s = 6.5, 6.0, 6.0, 6.0, 6.0
        toefl, toefl_s, toefl_l, toefl_r, toefl_w = 82, 20, 20, 18, 22

    # 23.apply_proces_en
    apply_proces_en = 'Check your chosen course is open to applications. Ensure you meet the admission requirements for this course as detailed on the previous tab. Ensure you meet our English language competency requirement and any course/major prerequisites. Apply'

    # 24.apply_fee
    apply_fee = 100

    # 25.china_score_requirements: first list entry, or '' when absent.
    gaokao_parts = response.xpath(
        "//*[contains(text(),'Chinese Gao Kao')]//following-sibling::div/ul/li"
    ).extract()
    china_score_requirements = remove_tags(
        gaokao_parts[0]) if gaokao_parts else ''

    # 26.degree_type
    degree_type = 1

    # 28.department
    department = remove_tags(''.join(
        response.xpath(
            "//*[contains(text(),'Faculty')]//following-sibling::div[1]/ul/li[1]"
        ).extract())).replace('&', '')

    # 29.duration_per / 30.duration: first digit found, defaulting to 3.
    duration_per = 1
    duration_html = ''.join(
        response.xpath(
            "//*[contains(text(),'duration')]//following-sibling::div[1]/ul/li[1]"
        ).extract())
    digits = re.findall(r'\d', duration_html)
    duration = digits[0] if digits else 3

    # 7.location: first following sibling only. A page without the section
    # used to raise IndexError; it now yields an empty location.
    location_parts = response.xpath(
        "//*[contains(text(),'Locations')]//following-sibling::*").extract()
    location = remove_tags(location_parts[0]).strip() if location_parts else ''
    if 'Perth' in location and 'Albany' in location:
        location = 'Albany,Perth'

    fields = (
        ('location', location),
        ('duration', duration),
        ('duration_per', duration_per),
        ('department', department),
        ('degree_type', degree_type),
        ('university', university),
        ('url', url),
        ('programme_en', programme_en),
        ('overview_en', overview_en),
        ('modules_en', modules_en),
        ('start_date', start_date),
        ('career_en', career_en),
        ('tuition_fee', tuition_fee),
        ('tuition_fee_pre', tuition_fee_pre),
        ('rntry_requirements_en', rntry_requirements_en),
        ('ielts', ielts),
        ('ielts_r', ielts_r),
        ('ielts_w', ielts_w),
        ('ielts_s', ielts_s),
        ('ielts_l', ielts_l),
        ('toefl', toefl),
        ('toefl_r', toefl_r),
        ('toefl_s', toefl_s),
        ('toefl_l', toefl_l),
        ('toefl_w', toefl_w),
        ('apply_proces_en', apply_proces_en),
        ('apply_pre', apply_pre),
        ('apply_fee', apply_fee),
        ('china_score_requirements', china_score_requirements),
    )
    for key, value in fields:
        item[key] = value

    # 27.degree_name: one item per listed degree.
    degree_options = response.xpath(
        "//*[contains(text(),'Degrees course is available in')]//following-sibling::div[1]//ul//li"
    ).extract()
    if degree_options:
        for option in degree_options:
            # Yield a copy so the next degree does not overwrite an item that
            # has already been handed on.
            # NOTE(review): assumes the object from get_item1 offers .copy()
            # (dict and scrapy.Item both do) - confirm.
            variant = item.copy()
            variant['degree_name'] = option.replace('<li>', '').replace(
                '</li>', '')
            yield variant
    else:
        item['degree_name'] = ''
        yield item
def parse(self, response):
    """Build and yield one item from a Sheffield Hallam University course page.

    Every field is read from ``response`` with a fixed XPath query, passed
    through the project's tag/class/whitespace helpers and stored on a fresh
    item created by ``get_item1(ScrapyschoolEnglandItem1)``.
    """

    def markup(query):
        # All strings extracted for *query*, concatenated into one string.
        return ''.join(response.xpath(query).extract())

    def after_heading(label):
        # Siblings following the element whose text contains *label*.
        return markup(
            "//*[contains(text(),'%s')]//following-sibling::*" % label)

    item = get_item1(ScrapyschoolEnglandItem1)

    # Title block: programme and award.
    programme_en = remove_tags(markup("/html/body/section[1]//h1"))
    degree_name = remove_tags(markup('/html/body/section[1]/div/div[2]/span'))

    # Key-facts panels, located by their question headings.
    tuition_fee = getTuition_fee(after_heading('What is the fee?'))
    duration = remove_tags(after_heading('How long will I study?')).strip()
    ucascode = clear_space_str(
        remove_tags(after_heading('What is the UCAS code?')))

    # Long-form sections keep their markup (only remove_class is applied).
    overview_en = clear_space_str(
        remove_class(after_heading('Course summary')))
    career_en = remove_class(after_heading('Future careers'))

    alevel = remove_tags(
        markup('//*[@id="entry-requirements"]/div/div[1]/ul[2]/li[1]'))
    apply_link = markup('//*[@id="apply-now"]/div[1]//a/@href')

    # IELTS: scores are only trusted when exactly two values are found;
    # the first is stored as the overall score, the second for every band.
    ielts_desc = remove_tags(
        markup('//*[@id="entry-requirements"]/div/div[1]'))
    scores = re.findall(r'[567]\.\d', ielts_desc)
    if len(scores) == 2:
        overall, band = scores
    else:
        overall = band = None

    require_chinese_en = '<p>The following qualifications from China will be considered for entry on to undergraduate programmes, with a minimum average of 60 per cent: Diploma from Specialised College (Zhongzhnan) Diploma from Vocational Secondary School (Zhixiao) Three year middle school diploma plus foundation degree A levels Graduate Diploma from: Radio and TV Universities Spare Time Universities Training Colleges for Administrative cadres Higher Education Self Study Examinations Adult Education/Adult Education in Science and Technology subjects Senior High School Diploma Chinese University Entrance Examination (until 2003) College Graduation Diploma (Dazhuan awarded by university/college on completion of 2-3 years study) Applicants who have completed the first year of an undergraduate degree at a Chinese university may be considered for direct entry to Sheffield Hallam University undergraduate programmes.Sheffield Hallam welcomes applications from international school students taking the International Baccalaureate Diploma and those achieving 28 points or more will usually be successful in obtaining an offer of a place on our undergraduate programmes. For information about IB points equivalences against the UCAS tariff please visit the UCAS website.</p>'

    # Start date: two literal cases, otherwise the month helper decides.
    intake = clear_space_str(remove_tags(after_heading('When do I start?')))
    if 'September, January' in intake:
        start_date = '2018-9,2019-1'
    elif 'January' in intake:
        start_date = '2019-1'
    else:
        start_date = '2018-' + str(translate_month(intake))

    modules_en = remove_class(markup('//div[@data-section="split"][6]'))

    # Field order matches the order in which the item has always been filled.
    fields = {
        'modules_en': modules_en,
        'ielts': overall,
        'ielts_r': band,
        'ielts_w': band,
        'ielts_s': band,
        'ielts_l': band,
        'start_date': start_date,
        'apply_pre': '£',
        'require_chinese_en': require_chinese_en,
        'ielts_desc': ielts_desc,
        'university': 'Sheffield Hallam University',
        'url': response.url,
        'programme_en': programme_en,
        'degree_type': 1,
        'degree_name': degree_name,
        'tuition_fee': tuition_fee,
        'tuition_fee_pre': '£',
        'duration': duration,
        'location': 'Sheffield',
        'ucascode': ucascode,
        'overview_en': overview_en,
        'career_en': career_en,
        'alevel': alevel,
        'apply_proces_en': apply_link,
        'duration_per': 1,
    }
    for key, value in fields.items():
        item[key] = value
    yield item
def parse(self, response):
    """Build and yield one item from a Goldsmiths, University of London page.

    Fields are read from ``response`` with fixed XPath queries and cleaned
    with the project's tag/class/whitespace helpers. Extraction steps that
    are wrapped in ``try`` fall back to a placeholder (``'N/A'``, ``None`` or
    the default IELTS scores) instead of aborting the whole page.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    # 1.university
    university = 'Goldsmiths, University of London'

    # 2.department
    try:
        department = ''.join(response.xpath(
            '//*[@id="maincontent"]/article/section[1]/div/div/div/div[1]/p/a'
        ).extract())
        department = remove_tags(clear_space_str(department))
    except Exception:
        department = 'N/A'

    # 3.programme_en
    # programme_en_a (the raw heading text) is also used for the degree-name
    # lookup further down, so the fallback has to define it as well.
    try:
        programme_en_a = ''.join(response.xpath(
            '//*[@id="maincontent"]/article/header/div/div/div/div[1]/div[1]/div/h1/span'
        ).extract())
        programme_en_a = remove_tags(programme_en_a)
    except Exception:
        programme_en_a = ''
        programme_en = 'N/A'
    else:
        # Drop the first two words of the heading, then tidy separators.
        programme_en = ' '.join(programme_en_a.split()[2:])
        programme_en = programme_en.replace(';', ' ')
        # NOTE(review): this also strips "in " from the end of longer words
        # (e.g. "Latin American"); kept as-is, confirm whether intended.
        programme_en = programme_en.replace('in ', '')
        programme_en = programme_en.strip()

    # 4.overview_en
    try:
        overview_en = ''.join(response.xpath(
            '//*[@id="maincontent"]/article/section[2]/div/div/div'
        ).extract())
        overview_en = remove_class(clear_space_str(overview_en))
    except Exception:
        overview_en = 'N/A'

    # 5.duration: first digit found in the duration paragraph.
    try:
        duration = ''.join(response.xpath(
            '//*[@id="maincontent"]/article/section[1]/div/div/div/div[2]/p'
        ).extract())
        duration = re.findall(r'\d', duration)[0]
        duration = remove_tags(duration)
    except Exception:
        duration = None

    # 6.duration_per
    duration_per = 1

    # 7.modules_en
    try:
        modules_en = ''.join(response.xpath(
            '//*[@id="maincontent"]/article/section[3]/div/div').extract())
        modules_en = clear_space_str(remove_class(modules_en))
    except Exception:
        modules_en = 'N/A'

    # 8.career_en
    try:
        career_en = ''.join(response.xpath(
            '//*[@id="maincontent"]/article/section[7]/div/div').extract())
        career_en = clear_space_str(remove_class(career_en))
    except Exception:
        career_en = 'N/A'

    # 9.other
    other = 'https://www.gold.ac.uk/media/study-section/fees/PG-Fees-1819.pdf'

    # 10.apply_proces_en
    apply_proces_en = ''.join(response.xpath(
        '//*[@id="maincontent"]/article/section[6]/div/div').extract())
    apply_proces_en = remove_class(clear_space_str(apply_proces_en))

    # 11.-15. IELTS (overall, writing, reading, speaking, listening)
    try:
        IELTS_list = ''.join(response.xpath(
            '//h3[contains(text(),"International qualifications")]/following-sibling::p'
        ).extract())
        IELTS_list = remove_tags(IELTS_list)
        pat = re.findall(r'\d\.\d', IELTS_list)
        if len(pat) == 3:
            ielts, ielts_w = pat[0], pat[1]
            ielts_r = ielts_s = ielts_l = pat[2]
        elif len(pat) == 2:
            ielts, ielts_w = pat
            ielts_r = ielts_s = ielts_l = None
        else:
            ielts = 6.5
            ielts_w = ielts_r = ielts_s = ielts_l = 6.0
        # NOTE(review): clear_space_str also receives None / float values
        # here; if it raises for those, the handler below replaces every
        # score with the defaults. Confirm that is the intended outcome.
        ielts = clear_space_str(ielts)
        ielts_r = clear_space_str(ielts_r)
        ielts_w = clear_space_str(ielts_w)
        ielts_s = clear_space_str(ielts_s)
        ielts_l = clear_space_str(ielts_l)
    except Exception:
        ielts = 6.5
        ielts_w = ielts_r = ielts_s = ielts_l = 6.0

    # 14.rntry_requirements
    rntry_requirements = ''.join(response.xpath(
        '//*[@id="maincontent"]/article/section[4]/div/div').extract())
    rntry_requirements = remove_class(rntry_requirements)

    # 15.apply_documents_en
    try:
        apply_documents_en = ''.join(response.xpath(
            '//h3[contains(text(),"When to apply")]/preceding-sibling::ul'
        ).extract())
        apply_documents_en = remove_class(clear_space_str(apply_documents_en))
    except Exception:
        apply_documents_en = 'N/A'

    # 16.url
    url = response.url

    # 17.degree_type
    degree_type = 2

    # 18.degree_name: first award abbreviation contained in the heading,
    # checked in this order; 'Graduate' when none matches.
    awards = ('MA', 'MSc', 'PGCert', 'MMus', 'MRes', 'MPhil', 'MFA', 'PhD')
    degree_name = next(
        (award for award in awards if award in programme_en_a), 'Graduate')

    # 19.location
    location = 'London'

    # 20.apply_pre
    apply_pre = '£'

    # 21.require_chinese_en
    # NOTE(review): the literal ends with a stray double quote after </p>;
    # left untouched because it is emitted data.
    require_chinese_en = '<p>Postgraduate taught For entry to postgraduate programmes you will normally need a Bachelors degree in relevant subject. Refer to individual course pages to see whether there are any additional application requirements.Research degrees You will normally need to have completed a Masters degree in a subject relevant to your proposed postgraduate study. There may also be other specific entrance requirements. You can refer to individual course pages to find out what these are.</p>"'

    # 22.assessment_en
    assessment_en = ''.join(response.xpath(
        "//*[contains(text(),'Assessment')]//following-sibling::*").extract())
    assessment_en = remove_class(assessment_en)

    item['assessment_en'] = assessment_en
    item['require_chinese_en'] = require_chinese_en
    item['apply_pre'] = apply_pre
    item['university'] = university
    item['department'] = department
    item['programme_en'] = programme_en
    item['overview_en'] = overview_en
    item['duration'] = duration
    item['duration_per'] = duration_per
    item['modules_en'] = modules_en
    item['career_en'] = career_en
    item['other'] = other
    item['apply_proces_en'] = apply_proces_en
    item['ielts'] = ielts
    item['ielts_w'] = ielts_w
    item['ielts_r'] = ielts_r
    item['ielts_s'] = ielts_s
    item['ielts_l'] = ielts_l
    item['rntry_requirements'] = rntry_requirements
    item['apply_documents_en'] = apply_documents_en
    item['url'] = url
    item['degree_type'] = degree_type
    item['degree_name'] = degree_name
    item['location'] = location
    yield item
def parse(self, response):
    """Build and yield one item from a Teesside University course page.

    Fields are read from ``response`` with fixed XPath queries and cleaned
    with the project's tag/class/whitespace helpers; fee, start date and
    the China entry text are constants. IELTS scores come from keyword
    tables rather than from the page.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    #1.university
    university = 'Teesside University'

    #2.url
    url = response.url

    #3.programme_en
    programme_en = ''.join(response.xpath(
        '//*[@id="coursepage"]/section[1]/div[1]/div/div[1]/h1/text()'
    ).extract())
    programme_en = remove_tags(programme_en).strip()

    #4.degree_type
    degree_type = 1

    #5.degree_name
    degree_name = ''.join(response.xpath(
        '//*[@id="coursepage"]/section[1]/div[1]/div/div[1]/h1/span'
    ).extract())
    degree_name = remove_tags(degree_name).strip()
    degree_name = degree_name.replace('(Hons)', '').strip()

    #6.overview_en
    overview_en = ''.join(response.xpath(
        '//*[@id="tab1"]/div/div[1]/div').extract())
    overview_en = remove_class(overview_en)

    #7.modules_en
    modules_en = ''.join(response.xpath(
        '//*[@id="tab2"]/div[1]/div/div[1]/div[1]').extract())
    modules_en = remove_class(modules_en)

    #8.assessment_en
    assessment_en = ''.join(response.xpath(
        '//*[@id="tab2"]/div[1]/div/div[1]/div[3]/p').extract())
    assessment_en = remove_class(assessment_en)

    #9.career_en
    career_en = ''.join(response.xpath(
        "//*[contains(text(),'Career opportunities')]//following-sibling::*"
    ).extract())
    career_en = remove_class(career_en)

    #10.ucascode: only the first four characters are kept.
    ucascode = ''.join(response.xpath(
        '//*[@id="coursepage"]/section[1]/div[1]/div/div[2]/div/div[2]/p/text()'
    ).extract())
    ucascode = clear_space_str(ucascode)
    try:
        ucascode = ucascode[:4]
    except Exception:
        ucascode = 'N/A'

    #11.department
    department = ''.join(response.xpath(
        '//*[@id="coursepage"]/section[1]/div[1]/div/div[2]/div/div[3]/a/p'
    ).extract())
    department = remove_tags(department).replace('& ', '')

    #12.duration
    duration = ''.join(response.xpath(
        '//*[@id="courseinfopdf"]/div[1]/ul/li[1]').extract())
    duration = remove_tags(duration)

    #13.tuition_fee
    tuition_fee = 11825

    #14.apply_desc_en
    apply_desc_en = ''.join(response.xpath(
        '//*[@id="tab3"]/div/div[1]/div').extract())
    apply_desc_en = remove_class(apply_desc_en)

    #15.start_date
    start_date = '2018-10-13'

    #16.tuition_fee_pre
    tuition_fee_pre = '£'

    #17.other
    other = 'https://www.tees.ac.uk/sections/international/fees.cfm'

    #18.require_chinese_en
    require_chinese_en = '<p>For entry onto a Foundation or Extended programme, applicants require: Huikao (Chinese senior secondary school graduation certificate) Successful completion of the first two years of Senior Secondary School with a minimum average of 70% or successful completion of Senior Secondary School with a minimum average of 60% For entry onto an Undergraduate programme, applicants require: For entry onto Year 1:Huikao (Chinese senior secondary school graduation certificate) Successful completion of Senior Secondary School with a minimum average of 80% Or Gaokao (Chinese university or college entrance exam) with a minimum score of 500 For entry onto Higher National Diploma: Gaokao with a minimum score of 450 For entry onto Integrated Master of Engineering – MEng (Hons): Gaokao with a minimum score of 550 For entry onto Undergraduate top-up programmes (third-year entry) Dazhuan (three-year college graduation diploma) with a minimum of 70% average or, SQA Higher National Diploma with BBC as minimum or, Edexcel Higher National Diploma – standard UK entry requirements or, UK accredited foundation degree</p>'

    #19.-23. IELTS
    # Band minimum (reading/writing/listening/speaking) is keyed on
    # degree_name; the first matching rule wins.
    clinical_awards = (
        'Dental Hygiene and Dental Therapy',
        'Diagnostic Radiography',
        'Midwifery',
        'Physiotherapy',
        'Occupational Therapy',
    )
    if any(keyword in degree_name for keyword in clinical_awards):
        band = 6.5
    elif 'Nursing Studies' in degree_name:
        band = 7.0
    else:
        band = 5.5
    ielts_r = ielts_w = ielts_l = ielts_s = band

    # Overall score is keyed on programme_en; groups are tested in order.
    # NOTE(review): this table always assigns ielts, so the overall score
    # never depends on the degree_name rules above. The collapsed source
    # does not show whether this chain was meant to be nested under their
    # fallback branch - confirm.
    subjects_5_5 = (
        'Fine Art', 'Design', 'Media Production', 'Engineering', 'Science',
        'Computing', 'Media Studies', 'Journalism',
    )
    subjects_6_0 = (
        'Business', 'English', 'Sport', 'History', 'Psychology',
        ' Criminology', 'Sociology', 'Youth Studies', 'Education', 'Law',
        'Crime', 'Investigation',
    )
    if any(keyword in programme_en for keyword in subjects_5_5):
        ielts = 5.5
    elif any(keyword in programme_en for keyword in subjects_6_0):
        ielts = 6.0
    elif 'Health' in programme_en:
        ielts = 7.0
    else:
        ielts = 6.0

    #24.apply_pre
    apply_pre = '£'

    item['university'] = university
    item['url'] = url
    item['programme_en'] = programme_en
    item['degree_type'] = degree_type
    item['degree_name'] = degree_name
    item['overview_en'] = overview_en
    item['modules_en'] = modules_en
    item['assessment_en'] = assessment_en
    item['career_en'] = career_en
    item['ucascode'] = ucascode
    item['department'] = department
    item['duration'] = duration
    item['tuition_fee'] = tuition_fee
    item['apply_desc_en'] = apply_desc_en
    item['start_date'] = start_date
    item['other'] = other
    item['tuition_fee_pre'] = tuition_fee_pre
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['ielts_s'] = ielts_s
    item['ielts_l'] = ielts_l
    item['require_chinese_en'] = require_chinese_en
    item['apply_pre'] = apply_pre
    # Hand the populated item on; without this the callback produces
    # nothing and the scraped page is silently discarded.
    yield item
def parse(self, response):
    """Build and yield one item from a University of Sunderland course page.

    Fields are read from ``response`` with fixed XPath queries and cleaned
    with the project's tag/class/whitespace helpers; IELTS scores and the
    application-process text are constants.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    #1.university
    university = 'University of Sunderland'

    #2.url
    url = response.url

    #3.degree_type
    degree_type = 1

    #4.degree_name
    degree_name = ''.join(response.xpath(
        '/html/body/div[2]/header/div/div[1]/h1/span[1]').extract())
    degree_name = remove_tags(degree_name)
    if '(Hons)' in degree_name:
        degree_name = degree_name.replace('(Hons)', '').strip()

    #5.programme_en
    programme_en = ''.join(response.xpath(
        '/html/body/div[2]/header/div/div[1]/h1/text()').extract())
    programme_en = clear_space_str(programme_en).strip()

    #6.duration: a combined "Full/Part" label carries no usable length.
    duration = ''.join(response.xpath(
        '/html/body/div[2]/header/aside/div/ul/li[1]/span').extract())
    duration = remove_tags(duration)
    if 'Full/Part' in duration:
        duration = ''

    #8.start_date: "<year>-<month>-<day>", where the day is the first
    # number in the text and the year/month prefix comes from the month
    # abbreviation found (checked in this order). Empty when no listed
    # month is present.
    start_date_list = ''.join(response.xpath(
        "//*[contains(text(),'Next start date')]//*").extract())
    start_date_list = remove_tags(start_date_list)
    day_numbers = re.findall(r'\d+', start_date_list)
    day = day_numbers[0] if day_numbers else ''
    month_prefixes = (
        ('Oct', '2018-10-'),
        ('Aug', '2018-8-'),
        ('Sep', '2018-9-'),
        ('Jan', '2019-1-'),
        ('Nov', '2018-11-'),
    )
    start_date = ''
    for month, prefix in month_prefixes:
        if month in start_date_list:
            start_date = prefix + day
            break

    #9.tuition_fee: amounts with a thousands separator go through the
    # project fee helper; otherwise the first run of digits is taken.
    tuition_fee = ''.join(response.xpath(
        "//*[contains(text(),' International fee')]//*|//*[contains(text(),'Tuition fee')]//*"
    ).extract())
    tuition_fee = remove_tags(tuition_fee)
    if ',' in tuition_fee:
        tuition_fee = getTuition_fee(tuition_fee)
    else:
        amounts = re.findall(r'\d+', tuition_fee)
        tuition_fee = amounts[0] if amounts else None

    #10.ucascode: capped at 50 characters.
    ucascode = ''.join(response.xpath(
        "//*[contains(text(),'UCAS code')]/..").extract())
    ucascode = remove_tags(ucascode).replace('UCAS code', '').strip()
    ucascode = ucascode[:50]

    #11.tuition_fee_pre
    tuition_fee_pre = '£'

    #12.overview_en
    overview_en = ''.join(response.xpath(
        "//*[contains(text(),'Overview')]//following-sibling::p").extract())
    overview_en = remove_class(overview_en)

    #13.modules_en
    modules_en = ''.join(response.xpath('//*[@id="course-years"]').extract())
    modules_en = remove_class(modules_en)

    #14.apply_desc_en
    apply_desc_en = ''.join(response.xpath(
        "//*[contains(text(),'Entry requirements')]//following-sibling::*"
    ).extract())
    apply_desc_en = remove_class(apply_desc_en)

    #15.-19. IELTS
    ielts = 6.0
    ielts_r = 5.5
    ielts_w = 5.5
    ielts_s = 5.5
    ielts_l = 5.5

    #20.career_en
    career_en = ''.join(response.xpath(
        "//*[contains(text(),'Employment')]//following-sibling::*"
    ).extract())
    career_en = remove_class(career_en)

    #21.apply_pre
    apply_pre = '£'

    #22.apply_proces_en
    apply_proces_en = '<p>When you are ready to make your application, there are six ways of applying. Choose the option which is the most convenient for you: Option 1: The Universitys overseas offices If you would like to apply to a course, you can contact one of our overseas offices to start the application process.Our overseas offices are able to answer any questions you may have about studying in the United Kingdom. Contact one of our offices in China, Malaysia, India, Vietnam or Greece to begin your application.Option 2: UCAS – undergraduate onlyInternational and UK students apply for undergraduate courses in the same way – through the Universities and Colleges Admissions Service (UCAS) website.The UCAS institution code for the University is S84.Option 3: Apply online – postgraduate only To study at postgraduate level, you need to apply to the University of Sunderland directly.Find the postgraduate course you’re interested in, and on the course page you will see a link to either apply online or download an application form (.pdf).Option 4: Email your application directly To apply for undergraduate and postgraduate courses at the main University of Sunderland campuses, email your completed application form (.pdf) to [email protected]. To apply for undergraduate and postgraduate courses at the University of Sunderland in London, email your completed application form (.pdf) to [email protected]. Option 5: In-country representatives If you live outside the UK, make your application by finding the most convenient contact from our in-country representatives.Once you have completed and submitted your application, you will be given a unique Personal ID number so you can be kept up-to-date with any developments in your application process. Option 6: Apply through a study centre To study with us in your country, through one of the University of Sunderlands study centres, you must apply directly through the relevant study centre. Visit the Other ways to study with us page to discover where you can study.</p>'

    #23.alevel
    alevel = ''.join(response.xpath(
        '//*[@id="fees-and-reqs"]/div[1]/p[3]').extract())
    alevel = remove_tags(alevel)

    item['alevel'] = alevel
    item['apply_proces_en'] = apply_proces_en
    item['university'] = university
    item['url'] = url
    item['degree_type'] = degree_type
    item['degree_name'] = degree_name
    item['programme_en'] = programme_en
    item['duration'] = duration
    item['start_date'] = start_date
    item['tuition_fee'] = tuition_fee
    item['ucascode'] = ucascode
    item['tuition_fee_pre'] = tuition_fee_pre
    item['overview_en'] = overview_en
    item['modules_en'] = modules_en
    item['apply_desc_en'] = apply_desc_en
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['ielts_s'] = ielts_s
    item['ielts_l'] = ielts_l
    item['career_en'] = career_en
    # The currency symbol was computed above but never stored.
    item['apply_pre'] = apply_pre
    yield item
def parse(self, response):
    """Build and yield one item from a Queen Mary University of London page.

    Fields are read from ``response`` with fixed XPath queries and cleaned
    with the project's tag/class/whitespace helpers. Extraction steps that
    are wrapped in ``try`` fall back to a placeholder (``'N/A'``, ``None``
    or ``0``) instead of aborting the page. Language requirements are
    looked up per department rather than scraped.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    #1.university
    university = 'Queen Mary University of London'

    #2.location
    location = 'London'

    #3.department
    department = ''.join(response.xpath(
        '//*[@id="count"]/article/div/aside/p[3]/a[1]').extract())
    department = remove_tags(department)

    #4.programme_en
    programme_en = ''.join(response.xpath(
        '//*[@id="count"]/article/header/h1').extract())
    programme_en = remove_tags(remove_class(programme_en))

    #5.degree_type
    degree_type = 2

    #6.degree_name 7.duration 8.duration_per
    # The sub-heading is split into the award (its first word) and the
    # first digit of the parenthesised length; duration_per is 3 when that
    # parenthetical mentions "months", otherwise 1.
    try:
        degree_name = ''.join(response.xpath(
            '//*[@id="count"]/article/header/h2').extract())
        degree_name = remove_tags(degree_name)
        duration = ''.join(re.findall(r'\(.*\)', degree_name))
        duration = duration.replace('(', '').replace(')', '')
        duration_per = 3 if 'months' in duration else 1
        duration = re.findall(r'\d', duration)[0]
        if duration in degree_name:
            degree_name = degree_name.replace(duration, '')
            degree_name = degree_name.replace('(', '').replace(')', '')
            degree_name = degree_name.split()[0]
        else:
            degree_name = 'N/A'
    except Exception:
        degree_name = 'N/A'
        duration = None
        duration_per = 1

    #9.overview_en: text between the 'Overview' and 'Why study' markers.
    overview_en = ''.join(response.xpath('//*[@id="first"]').extract())
    overview_en = remove_class(overview_en)
    start = overview_en.find('Overview')
    end = overview_en.find('Why study')
    overview_en = clear_space_str(overview_en[start:end])

    #10.teach_time
    teach_time = 'full time'

    #11.modules_en
    try:
        modules_en = ''.join(response.xpath('//*[@id="second"]').extract())
        modules_en = remove_class(modules_en)
        if 'For more information contact' in modules_en:
            start = modules_en.find('Structure')
            end = modules_en.find('For more information contact')
            modules_en = clear_space_str(modules_en[start:end])
    except Exception:
        modules_en = 'N/A'

    #12.assessment_en
    try:
        assessment_en = ''.join(response.xpath('//*[@id="fourth"]').extract())
        assessment_en = clear_space_str(remove_class(assessment_en))
    except Exception:
        assessment_en = 'N/A'

    #13.career_en
    try:
        career_en = ''.join(response.xpath('//*[@id="sixth"]').extract())
        career_en = clear_space_str(remove_class(career_en))
    except Exception:
        career_en = 'N/A'

    #14.tuition_fee: first "12,345"-style amount in the second paragraph
    # of the fees section, else in the first; 0 when neither has one.
    fee_pattern = r'\d{1,3},\d{3}'
    try:
        tuition_fee1 = ''.join(response.xpath(
            '//*[@id="fifth"]/p[2]/text()').extract())
        tuition_fee1 = re.findall(fee_pattern, remove_tags(tuition_fee1))
        if tuition_fee1 == []:
            tuition_fee = ''.join(response.xpath(
                '//*[@id="fifth"]/p[1]/text()').extract())
            tuition_fee = re.findall(
                fee_pattern, remove_tags(tuition_fee))[0]
        else:
            tuition_fee = tuition_fee1[0]
        # Strip the thousands separator from whichever amount was found.
        tuition_fee = tuition_fee.replace(',', '')
    except Exception:
        tuition_fee = 0

    #15.tuition_fee_pre
    tuition_fee_pre = '£'

    #16.entry_requirements
    try:
        entry_requirements_list = ''.join(response.xpath(
            '//*[@id="third"]').extract())
        entry_requirements_list = remove_class(entry_requirements_list)
        if 'International applicants' in entry_requirements_list:
            start = entry_requirements_list.find('Entry requirements')
            mid = entry_requirements_list.find('International applicants')
            end = entry_requirements_list.find('For more information')
            entry_requirements = entry_requirements_list[start:mid]
            other = entry_requirements_list[mid:end]
        else:
            entry_requirements = entry_requirements_list
            other = 'N/A'
        if 'mso-fareast-language:EN-US' in entry_requirements:
            # Section contains pasted word-processor markup: keep only the
            # text from the heading up to the 'Normal' marker. Both
            # positions must be found, otherwise the text is left as it is.
            # NOTE(review): 'Normal' is assumed to mark where the embedded
            # markup starts - confirm against a live page.
            start = entry_requirements.find('Entry requirements')
            end = entry_requirements.find('Normal')
            if start != -1 and end > start:
                entry_requirements = entry_requirements[start:end]
        entry_requirements = clear_space_str(entry_requirements)
        # NOTE(review): 'other' is cleaned here but never stored on the item.
        other = clear_space_str(other)
    except Exception:
        entry_requirements = 'N/A'
        other = 'N/A'

    #17. IELTS / TOEFL by department. Tuple layout:
    # (ielts, ielts_l, ielts_s, ielts_r, ielts_w,
    #  toefl, toefl_l, toefl_s, toefl_r, toefl_w)
    language_scores = {
        'School of Business and Management':
            (7, 5.5, 5.5, 5.5, 6, 100, 17, 20, 18, 21),
        'School of English and Drama':
            (7, 7, 7, 7, 7, 100, 22, 25, 24, 27),
        'School of Geography':
            (7, 5.5, 5.5, 5.5, 6.5, 100, 17, 20, 18, 24),
        'School of History':
            (7, 5.5, 5.5, 5.5, 6.5, 100, 17, 20, 18, 24),
        'School of Languages, Linguistics and Film':
            (7, 5.5, 5.5, 5.5, 7, 100, 17, 20, 18, 27),
        'School of Law':
            (7, 5.5, 5.5, 5.5, 7, 100, 17, 20, 18, 24),
        'School of Politics and International Relations':
            (7, 5.5, 5.5, 5.5, 6.5, 100, 17, 20, 18, 24),
    }
    default_scores = (6.5, 5.5, 5.5, 5.5, 6, 92, 17, 20, 18, 21)
    (ielts, ielts_l, ielts_s, ielts_r, ielts_w,
     toefl, toefl_l, toefl_s, toefl_r, toefl_w) = language_scores.get(
        department, default_scores)

    url = response.url

    apply_documents_en = 'You must provide the following supporting documentation: Completed application form Degree transcripts. Please provide a transcript of your degree(s). If you have not yet completed your degree please provide a transcript of your results achieved to date If your degree was from a UK university, please upload a transcript of your marks for each year If your degree was from an overseas institution, you should supply a transcript of your marks for each year of your studies and a copy of your degree certificate together with a certified translation if the document is not in English. Please note that original documentation will be required before you enrol. International applicants are also advised to include high school transcripts Please provide the contact details of two referees on your application, at least one reference must be from an academic referee who is in a position to comment on the standard of your academic work and suitability for postgraduate level study. Where appropriate, a second referee can provide comment on your professional experience. Your academic referee(s) may already have provided you with a reference that you can use to support any application for study or research that you make. We call these ‘open’ references. Open references will normally only be accepted if they are written on headed paper, provided as a colour copy of the original, and provide the referee’s work contact details. If you have open references, please upload these at the time of application If you do not have open reference, we will contact your referee(s) via email to supply a reference, preferably electronically. Please note, we can only accept references provided by email if it is sent from a university or company email address. References from a personal email address such as Yahoo or Hotmail are not acceptable. Your referee(s) can also supply a paper reference in response to the reference request email your referee will receive. Paper reference forms should be endorsed by an appropriate institution/company stamp or on official institution/company letterhead, and should be provided as a scanned colour copy of the original. Curriculum Vitae (CV)/ Resume This list of documents may vary slightly from course to course, so you will need to check the guidance notes and academic school website for the programme that you are applying for. Although not mandatory, you are encouraged to send in the following documents in support of your application: Statement of purpose Your statement of purpose should explain why you want to study your chosen programme and how it will help your life and career. This should typically be one side of A4 paper. IELTS/TOEFL certificate (if applicable) International applicants should provide evidence of English language ability: IELTS, TOEFL, or other acceptable proof. Please see the international students section for more details.'
    require_chinese_en = 'Taught degrees (MSc/MA: one year) For entry onto our masters level courses students should normally have achieved: Four-year bachelors degree from 211 or 985 University with 75%+ average Four-year bachelors degree from non-211 University within top 300 with average 80%+ The usual entrance requirement to a taught masters degree is a four-year bachelors degree from a 211 University. However, all applications are considered on an individual basis and students may be admitted to masters programmes with a lower level degree if they have work experience relevant to the degree applied for. Students with a three-year diploma (dazhuan) from a recognised institution may apply for the Pre-Masters Graduate Diploma, a year-long course which will gain them access to a masters programme.Research degree (MPhil/PhD: three years) For entry onto our research degree courses students should normally have a masters degree from a recognised university.'
    apply_fee = 0
    apply_pre = '£'

    item['apply_fee'] = apply_fee
    item['apply_pre'] = apply_pre
    item['require_chinese_en'] = require_chinese_en
    item['apply_documents_en'] = apply_documents_en
    item['university'] = university
    item['location'] = location
    item['department'] = department
    item['programme_en'] = programme_en
    item['degree_type'] = degree_type
    item['degree_name'] = degree_name
    item['duration'] = duration
    item['duration_per'] = duration_per
    item['overview_en'] = overview_en
    item['teach_time'] = teach_time
    item['modules_en'] = modules_en
    item['assessment_en'] = assessment_en
    item['career_en'] = career_en
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = tuition_fee_pre
    item['rntry_requirements'] = entry_requirements
    item['ielts'] = ielts
    item['ielts_l'] = ielts_l
    item['ielts_s'] = ielts_s
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['toefl'] = toefl
    item['toefl_l'] = toefl_l
    item['toefl_s'] = toefl_s
    item['toefl_r'] = toefl_r
    item['toefl_w'] = toefl_w
    item['url'] = url
    yield item
def parse_data(self, response):
    """Parse one University of Bristol programme page and yield a filled item.

    Reads the programme title, award names, duration, location, start date,
    deadline, department, overview, overseas tuition fee, modules, entry
    requirements, English-language scores, careers text and application
    documents out of ``response`` via XPath, stores them on a fresh item
    and yields that item.

    Any exception raised while extracting is caught: it is printed and
    appended, together with the page URL, to a per-university error file,
    and no item is yielded for that page.

    :param response: the downloaded programme page; must provide ``.url``
        and ``.xpath(...).extract()``.
    :return: generator yielding at most one item.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = "University of Bristol"
    item['url'] = response.url
    # Default degree type; refined below from the award names.
    item['degree_type'] = 2
    print("===========================")
    print(response.url)

    # IELTS/TOEFL scores for each English-language "Profile" label.
    # Lookup is by exact match on the extracted description, as before.
    profile_scores = {
        "Profile A": {
            'ielts': 7.5, 'ielts_l': 7.0, 'ielts_s': 7.0, 'ielts_r': 7.0, 'ielts_w': 7.0,
            'toefl': 109, 'toefl_l': 25, 'toefl_r': 25, 'toefl_s': 25, 'toefl_w': 29,
        },
        "Profile B": {
            'ielts': 7.0, 'ielts_l': 6.5, 'ielts_s': 6.5, 'ielts_r': 6.5, 'ielts_w': 6.5,
            'toefl': 100, 'toefl_l': 24, 'toefl_r': 24, 'toefl_s': 24, 'toefl_w': 24,
        },
        "Profile C": {
            'ielts': 6.5, 'ielts_l': 6.5, 'ielts_s': 6.5, 'ielts_r': 6.5, 'ielts_w': 6.5,
            'toefl': 92, 'toefl_l': 23, 'toefl_r': 23, 'toefl_s': 23, 'toefl_w': 24,
        },
        "Profile D": {
            'ielts': 6.5, 'ielts_l': 6.0, 'ielts_s': 6.0, 'ielts_r': 7.0, 'ielts_w': 7.0,
            'toefl': 92, 'toefl_l': 21, 'toefl_r': 21, 'toefl_s': 21, 'toefl_w': 27,
        },
        "Profile E": {
            'ielts': 6.5, 'ielts_l': 6.0, 'ielts_s': 6.0, 'ielts_r': 6.0, 'ielts_w': 6.0,
            'toefl': 90, 'toefl_l': 20, 'toefl_r': 20, 'toefl_s': 20, 'toefl_w': 20,
        },
        "Profile F": {
            'ielts': 6.0, 'ielts_l': 6.5, 'ielts_s': 6.5, 'ielts_r': 6.0, 'ielts_w': 6.0,
            'toefl': 86, 'toefl_l': 20, 'toefl_r': 20, 'toefl_s': 20, 'toefl_w': 23,
        },
    }

    try:
        # Programme name
        course = response.xpath("//h1[@id='pagetitle']/span//text()").extract()
        item['programme_en'] = ''.join(course).replace("\n", " ").replace("\r", " ").strip()
        print("item['programme_en']: ", item['programme_en'])

        # Awards available -> degree name, teaching type and degree type
        degreeaward = response.xpath(
            "//th[contains(text(),'Awards available')]/following-sibling::td[1]//text()").extract()
        item['degree_name'] = clear_space_str(''.join(degreeaward))
        print("item['degree_name']: ", item['degree_name'])

        degree_lower = item['degree_name'].lower()
        if "phd" in degree_lower or "md" in degree_lower:
            item['teach_type'] = "phd"
            if "research" in degree_lower:
                item['teach_type'] += " " + "research"
            # NOTE(review): the file's indentation was lost; degree_type = 3 is
            # assumed to apply to every PhD/MD award, not only "research" ones
            # - confirm against the original layout.
            item['degree_type'] = 3
        elif "research" in degree_lower:
            item['teach_type'] = "research"
            item['degree_type'] = 3
        else:
            item['teach_type'] = "taught"
            item['degree_type'] = 2

        # Duration
        duration = response.xpath(
            "//th[@scope='row'][contains(text(),'Programme length')]/following-sibling::td[1]//text()").extract()
        clear_space(duration)
        duration_text = ''.join(duration)
        item['teach_time'] = getTeachTime(duration_text)
        duration_list = getIntDuration(duration_text)
        if len(duration_list) == 2:
            item['duration'] = duration_list[0]
            item['duration_per'] = duration_list[1]

        # Location
        location = response.xpath(
            "//th[@scope='row'][contains(text(),'Location of programme')]/following-sibling::td[1]//text()").extract()
        item['location'] = clear_space_str(''.join(location))

        # Start date: the last text node of the cell is the one parsed.
        startdate = response.xpath(
            "//th[@scope='row'][contains(text(),'Start date')]/following-sibling::td[1]//text()").extract()
        clear_space(startdate)
        print("startdate = ", startdate)
        if len(startdate) > 0:
            item['start_date'] = getStartDate(startdate[-1])
            print("item['start_date'] = ", item['start_date'])

        # Application deadline
        deadline = response.xpath("//div[@id='apply']/div[@class='apply-deadline']/p[1]//text()").extract()
        item['deadline'] = getStartDate(''.join(deadline))

        # Department: taken from the contact address lines; when several
        # lines mention a School/Faculty, the last one wins.
        department = response.xpath("//div[@id='contact']/p[@class='pg-contact-address']/text()").extract()
        clear_space(department)
        for d in department:
            if "School" in d or "Faculty" in d:
                item['department'] = d
        if item['department'] == "":
            # Fall back to the first "School of ..." phrase anywhere in the page body.
            allcontent = response.xpath("//main[@class='content']//text()").extract()
            clear_space(allcontent)
            department_re = re.findall(r"School\sof.{1,30}", ''.join(allcontent), re.I)
            if len(department_re) > 0:
                item['department'] = department_re[0].strip()

        # Overview (taught or research page layout)
        overview = response.xpath("//div[@id='programme-overview']|//div[@id='pgr-overview']").extract()
        item['overview_en'] = remove_class(clear_lianxu_space(overview))

        # Tuition fee: prefer the "Overseas: full-time" entry.
        tuitionFee = response.xpath(
            "//dt[contains(text(),'Overseas: full-time')]/following-sibling::dd[1]//text()").extract()
        clear_space(tuitionFee)
        print("tuitionFee = ", tuitionFee)
        if len(tuitionFee) > 0:
            fee_text = tuitionFee[0].replace("£", "").replace(",", "").strip()
            try:
                fee_value = int(fee_text)
            except ValueError:
                # Fee text is not a bare number (e.g. carries a suffix). Do not
                # abort the whole item; leave the fee unset so the more tolerant
                # fallback below gets a chance to parse it.
                print("tuitionFee not numeric: ", fee_text)
            else:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = fee_value
        if item['tuition_fee'] is None:
            # Fallback: any "Overseas:" entry, parsed by the shared helper.
            tuitionFee1 = response.xpath(
                "//dl//dt[contains(text(),'Overseas:')]/following-sibling::dd[1]//text()").extract()
            clear_space(tuitionFee1)
            print("tuitionFee1 = ", tuitionFee1)
            if len(tuitionFee1) > 0:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = getTuition_fee(''.join(tuitionFee1))
        # NOTE(review): nesting of this reset was ambiguous in the collapsed
        # source; it is applied after both parsing paths - confirm.
        if item['tuition_fee'] == 0:
            item['tuition_fee_pre'] = ""
            item['tuition_fee'] = None
        if item['tuition_fee'] is None:
            print("tuition_fee 为空")
        print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
        print("item['tuition_fee']: ", item['tuition_fee'])

        # Modules / research groups
        modules = response.xpath("//div[@id='programme-structure']|//div[@id='pgr-research-groups']").extract()
        item['modules_en'] = remove_class(clear_lianxu_space(modules))
        print("item['modules_en']: ", item['modules_en'])

        # Academic entry requirements
        entryRequirements = response.xpath("//div[@id='entry-requirements']//text()").extract()
        item['rntry_requirements'] = clear_lianxu_space(entryRequirements)

        # English-language requirement: either a named profile or free text
        ielts = response.xpath(
            "//*[contains(text(),'Profile')]//text()|//div[contains(text(),'IELTS')]//text()").extract()
        item['ielts_desc'] = clear_lianxu_space(ielts)
        ielts_desc = item['ielts_desc']
        if ielts_desc in profile_scores:
            for field, value in profile_scores[ielts_desc].items():
                item[field] = value
        elif "Profile" not in ielts_desc:
            # No profile label at all: parse the scores out of the text itself.
            ieltsDict = get_ielts(ielts_desc)
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")

        # Careers
        career = response.xpath("//div[@id='careers']").extract()
        item['career_en'] = remove_class(clear_lianxu_space(career))

        # Static entry requirements text for applicants from China.
        require_chinese_en = (
            '<h2 id="pgentryreqs">Entry requirements for postgraduate programmes</h2> '
            '<p>You should <a href="/pg-howtoapply/">apply online</a> for all our postgraduate programmes.</p> '
            '<p>To be considered for admission to postgraduate study at the University of Bristol, '
            'the minimum requirement for entry is an undergraduate (Bachelor’s) degree that is equivalent '
            'to a UK Upper Second Class degree (also known as a 2:1). Please refer to the '
            '<a href="http://www.bristol.ac.uk/study/postgraduate/admissions-statements/%20%20%20" target="_blank">'
            'Postgraduate Admissions Statements</a> for each programme for individual entry requirements.</p> '
            '<ul> '
            "<li>Applicants who hold a 4-year Bachelor's (Honours) degree from a prestigious university "
            "with a minimum of 80% will be considered for admission to a Master's degree.</li> "
            "<li>Applicants who hold a good Master's degree from a prestigious university "
            'will be considered for admission to PhD study.</li> '
            '<li>Applicants will be required to meet the English language requirements for the programme. '
            'The profile level requirements can be found on the '
            '<a href="http://www.bristol.ac.uk/study/language-requirements/" target="_blank">'
            'English language requirements for study</a> page.</li> </ul>'
        )
        item["require_chinese_en"] = remove_class(require_chinese_en)

        # Static application process text, from
        # http://www.bristol.ac.uk/study/postgraduate/apply/
        # NOTE(review): the mailto address below looks like e-mail-obfuscation
        # residue ("[email protected]") rather than a real address - confirm
        # and restore the intended address if so.
        apply_proces_html = (
            '<p>We offer an online application system for all of our programmes, '
            'except the Postgraduate Certificate in Education for which you should '
            '<a href="https://www.ucas.com/ucas/teacher-training/ucas-teacher-training-apply-and-track">'
            'apply through UCAS</a>.</p> '
            '<p>You can use our online admissions system to:</p> '
            '<ul> '
            '<li>submit all your application details securely online and view your completed application form;</li> '
            '<li>upload supporting documents;</li> '
            '<li>request references electronically;</li> '
            '<li>track the progress of your application;</li> '
            '<li>receive a decision on your application online;</li> '
            '<li>update your contact details (it is important you tell us if you change your home address or email);</li> '
            '<li>receive useful information about the University and your application.</li> '
            '</ul> '
            '<p>If you are unable to make an online application, please contact the Enquiries team on '
            '<a href="mailto:[email protected]">[email protected]</a>.</p>'
        )
        item['apply_proces_en'] = remove_class(clear_lianxu_space([apply_proces_html]))

        # Application documents: siblings preceding the English-language heading
        apply_documents_en = response.xpath(
            "//h3[contains(text(),'English language requirements')]/preceding-sibling::*[position()<last()]").extract()
        item["apply_documents_en"] = remove_class(clear_lianxu_space(apply_documents_en))
        print("item['apply_documents_en']: ", item['apply_documents_en'])

        yield item
    except Exception as e:
        # Best-effort boundary: record the failure and move on to the next page.
        print("异常:", str(e))
        print("报错链接:", response.url)
        with open("scrapySchool_England/error/" + item['university'] + str(item['degree_type']) + ".txt",
                  'a+', encoding="utf-8") as f:
            f.write(str(e) + "\n" + response.url + "\n========================\n")