def parse_fee(self, response):
    """Parse the tuition fee and schedule a request for the 'applying' page.

    The values found in ``response.meta`` are forwarded unchanged, together
    with the fee parsed from this response, to ``self.parse_apply``.
    """
    carried = response.meta
    # Read in the same order as before so a missing key fails identically.
    overview, modules, department, ielts, rntry_requirements = (
        carried[key]
        for key in ('overview', 'modules', 'department', 'ielts',
                    'rntry_requirements')
    )
    content_sections = response.xpath('//section[@class="content"]').extract()
    tuition_fee = getTuition_fee(content_sections)
    toefl = carried['toefl']

    # The follow-up URL is this URL with 'fees' replaced by 'applying'.
    apply_url = response.url.replace('fees', 'applying')
    forwarded = {
        'overview': overview,
        'ielts': ielts,
        'toefl': toefl,
        'department': department,
        'modules': modules,
        'rntry_requirements': rntry_requirements,
        'tuition_fee': tuition_fee,
    }
    yield scrapy.Request(apply_url, meta=forwarded, callback=self.parse_apply)
def parse(self, response):
    """Build one item from a University of Adelaide degree page.

    Scrapes the programme title, study plan, duration, location, overview,
    IELTS/TOEFL requirements, entry requirements, tuition fee and career
    text from fixed XPath locations and yields a single populated item.
    Fields that cannot be found fall back to the defaults coded below.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    def joined(xpath_query):
        """Return all nodes matched by *xpath_query* concatenated into one string."""
        return ''.join(response.xpath(xpath_query).extract())

    university = 'The University of Adelaide'
    url = response.url

    # Programme title. The raw heading text is also stored as the degree name.
    programme_en_a = joined('//*[@id="ua-main-content"]/h2/text()')
    programme_title = remove_tags(programme_en_a)
    programme_en = programme_title.replace('Bachelor of', '').strip()
    # A parenthesised specialisation, when present, becomes the programme
    # name.  re.search is used so an unbalanced "(" no longer raises.
    specialisation = re.search(r'\((.*)\)', programme_en)
    if specialisation:
        programme_en = specialisation.group(1)

    degree_type = 1
    degree_name = programme_en_a

    # Study plan: only kept when it actually contains a table.
    modules_en = remove_class(joined(
        "//*[contains(text(),'Example Study Plan')]//following-sibling::*"))
    if 'table' not in modules_en:
        modules_en = None

    # Duration: 1.5 is special-cased, otherwise the first digit found.
    duration_text = remove_tags(joined(
        '//*[@id="ua-main-content"]/div[2]/div[3]/span[2]'))
    if '1.5' in duration_text:
        duration = 1.5
    else:
        duration_digits = re.findall(r'\d', duration_text)
        duration = duration_digits[0] if duration_digits else None
    duration_per = 1

    location = remove_tags(joined(
        '//*[@id="ua-main-content"]/div[2]/div[1]/span[2]/a'))
    if '2019/hd' in response.url:
        location = 'North Terrace Campus'
    elif not location:
        location = 'Online'

    overview_en = remove_class(joined(
        '//*[@id="ua-main-content"]/div[2]/div/div'))

    # IELTS: one table cell per band; index 1 is read as the overall score
    # and indexes 2-5 as reading, listening, speaking and writing.
    ielts_cells = response.xpath(
        '//*[@id="df-acc-admission"]/div[5]/table[2]//tr[2]/td/table//tr/td'
    ).extract()

    def ielts_band(index, shortcut, shortcut_value, pattern, missing_default):
        """Read one IELTS band from ``ielts_cells``.

        Returns *missing_default* when the cell does not exist,
        *shortcut_value* when the cell contains *shortcut*, otherwise the
        first *pattern* match (a string) or None when nothing matches.
        """
        if index >= len(ielts_cells):
            return missing_default
        cell = ielts_cells[index]
        if shortcut in cell:
            return shortcut_value
        found = re.findall(pattern, cell)
        return found[0] if found else None

    ielts = ielts_band(1, '7', 7, r'\d\.\d', 6.5)
    ielts_r = ielts_band(2, '6.5', 6.5, r'\d', 6)
    ielts_l = ielts_band(3, '6.5', 6.5, r'\d', 6)
    ielts_s = ielts_band(4, '6.5', 6.5, r'\d', 6)
    ielts_w = ielts_band(5, '6.5', 6.5, r'\d', 6)

    # TOEFL: the first five numbers are overall, reading, listening,
    # speaking and writing; with fewer than five the defaults are used.
    toefl_text = remove_tags(joined(
        '//*[@id="df-acc-admission"]/div[5]/table[2]//tr[3]/td/table//tr/td'))
    toefl_numbers = re.findall(r'\d+', toefl_text)
    if len(toefl_numbers) >= 5:
        toefl, toefl_r, toefl_l, toefl_s, toefl_w = toefl_numbers[:5]
    else:
        toefl, toefl_r, toefl_l, toefl_s, toefl_w = 94, 24, 24, 23, 27

    rntry_requirements_en = remove_class(joined(
        '//*[@id="df-acc-admission"]/div[5]/table[3]//tr/td'))

    apply_proces_en = 'https://international.adelaide.edu.au/admissions/how-to-apply'

    # Application deadlines depend on the programme.
    if any(name in programme_en
           for name in ('Medicine and Surgery', 'Dental Surgery', 'Oral Health')):
        deadline = '2018-6-30,2019-5-1'
    elif ('Nursing' in programme_en
          # Checked against the full title: programme_en has already been
          # reduced to the text inside the parentheses, so it can never
          # contain "Science (Veterinary Bioscience)".
          or 'Science (Veterinary Bioscience)' in programme_title):
        deadline = '2018-9-30,2019-5-1'
    else:
        deadline = '2018-12-1,2019-5-1'

    tuition_fee = getTuition_fee(joined(
        '//*[@id="df-acc-fees_scholarships"]/div[5]/table//tr/td[2]'))
    tuition_fee_pre = '$'
    apply_pre = '$'

    career_en = remove_class(joined(
        '//*[@id="df-acc-careers_parent"]//following-sibling::*'))

    item['university'] = university
    item['url'] = url
    item['programme_en'] = programme_en
    item['degree_type'] = degree_type
    item['degree_name'] = degree_name
    item['duration'] = duration
    item['duration_per'] = duration_per
    item['location'] = location
    item['overview_en'] = overview_en
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['ielts_s'] = ielts_s
    item['ielts_l'] = ielts_l
    item['toefl'] = toefl
    item['toefl_r'] = toefl_r
    item['toefl_s'] = toefl_s
    item['toefl_w'] = toefl_w
    item['toefl_l'] = toefl_l
    item['rntry_requirements_en'] = rntry_requirements_en
    item['apply_proces_en'] = apply_proces_en
    item['deadline'] = deadline
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = tuition_fee_pre
    item['career_en'] = career_en
    item['apply_pre'] = apply_pre
    item['modules_en'] = modules_en
    yield item
def parse(self, response):
    """Build one item from a London Metropolitan University course page.

    Scrapes the UCAS code, title, A-level requirement, overview, duration,
    tuition fee, modules, assessment and career text, adds the fixed
    application texts, and yields a single populated item.  degree_name,
    location and the IELTS fields are not populated by this callback.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    def joined(xpath_query):
        """Return all nodes matched by *xpath_query* concatenated into one string."""
        return ''.join(response.xpath(xpath_query).extract())

    def first_value(xpath_query):
        """Return the first value matched by *xpath_query*, or '' when none."""
        values = response.xpath(xpath_query).extract()
        return values[0] if values else ''

    university = 'London Metropolitan University'
    url = response.url

    ucascode = clear_space_str(remove_tags(joined(
        "//*[contains(text(),'UCAS code:')]//following-sibling::*")))

    programme_en = remove_tags(joined('//*[@id="MainContent"]/div[1]/h1'))
    degree_type = 1

    alevel = clear_space_str(remove_tags(joined(
        '//*[@id="entry-requirements"]/div/ul/li[1]')))

    overview_en = clear_space_str(remove_class(joined(
        '//*[@id="LeftColumn"]/section/p')))

    start_date = '2018-8-18'
    apply_pre = '£'

    # Duration: prefer the September 2019 intake and fall back to
    # September 2018.  Previously the fallback could never succeed: a
    # missing 2019 node raised before it was tried, and the fallback query
    # itself lacked .extract().
    duration = (
        first_value(
            "//*[contains(text(),'September 2019 - Full-time')]//@data-duration")
        or first_value(
            "//*[contains(text(),'September 2018 - Full-time')]//@data-duration")
    )

    # Tuition fee uses the same 2019-then-2018 preference.
    fee_text = joined(
        "//*[contains(text(),'September 2019 - Full-time')]//@data-cost")
    if not fee_text:
        fee_text = joined(
            "//*[contains(text(),'September 2018 - Full-time')]//@data-cost")
    tuition_fee = getTuition_fee(fee_text)

    apply_documents_en = '<p>We welcome applications from all suitably qualified prospective students and want to recruit students with the very best academic merit, potential and motivation, irrespective of their background. We carefully consider each application on an individual basis, taking into account all the information presented on your application form, including your: academic achievement (including predicted and achieved grades) personal statement two references CV</p>'

    modules_en = remove_class(
        joined('//*[@id="modular-structure"]')).replace('▼', '')

    assessment_en = remove_class(joined(
        "//h3[contains(text(),'Assessment')]//following-sibling::p[1]"))

    career_en = remove_class(joined('//*[@id="career-opportunities"]/div'))

    require_chinese_en = "https://www.londonmet.ac.uk/international/international-admissions/application-guidance-and-entry-criteria/academic-entry-requirements-by-country/non-eueea-countries/china/"

    apply_proces_en = '<p>Stage 1: choosing your course The first step for you as a new applicant is to choose the course you wish to undertake. If you have any questions at this stage you can contact our international recruitment team who will be happy to assist you and provide information about our courses. You can begin a conversation about a course you are interested in by emailing our recruitment team at: [email protected]. We often have representatives of London Metropolitan University visiting countries all around the world. You can find out the latest planned trips to see if we will be visiting near you, here: Meet us overseas Stage 2: applying for your course Once you have decided on your course you need to submit an application as soon as possible making sure you observe the international application deadlines. The method of application depends on the type of course you are applying for. The application methods available for each course are listed on the course page. You should check these details carefully to avoid any delay in your application reaching us.You should observe our international application guidance before submitting an application. Please see here: International application advice Stage 3: awaiting and responding to your offer Once the University receives your application you will receive a communication from us acknowledging this. You will also obtain your London Metropolitan University application ID and details about using, the applicant portal (Evision). At this point your application will enter the pending decision/consideration stage, and we will communicate with you again, either to request more information (such as a qualification transcript, portfolio, or piece of written work) for assessment, or to advise you of our decision.If you are successful in receiving an offer from us you will receive a communication detailing a conditional or unconditional offer, and this will contain further information and instruction. If your application is unsuccessful we will also contact you advising you of this, and our reasons for the decision. You can find out more about offers here: Information and advice for offer holders.Stage 4: Immigration and enrolment After obtaining an unconditional offer you will need to focus on making preparations to join the university and your arrangements to come to the UK (if you are not already here). You will receive further information about when and where to arrive, and how to attend your course enrolment closer to the enrolment period of your course.You should be considering your accommodation and finances as soon as possible before the start of term, and you should also be aware of, and be prepared to meet, any immigration requirements such as obtaining a student visa at the earliest opportunity. You can find a variety of information about moving to London here: Immigration and Arrival Advice: New Students.</p>'

    tuition_fee_pre = '£'

    item['apply_pre'] = apply_pre
    item['apply_documents_en'] = apply_documents_en
    item['university'] = university
    item['url'] = url
    item['programme_en'] = programme_en
    item['degree_type'] = degree_type
    item['overview_en'] = overview_en
    item['start_date'] = start_date
    item['duration'] = duration
    item['tuition_fee'] = tuition_fee
    item['modules_en'] = modules_en
    item['assessment_en'] = assessment_en
    item['career_en'] = career_en
    item['require_chinese_en'] = require_chinese_en
    item['apply_proces_en'] = apply_proces_en
    item['tuition_fee_pre'] = tuition_fee_pre
    item['alevel'] = alevel
    item['ucascode'] = ucascode
    yield item
def parse_data(self, response):
    """Build one item from a University of Bristol postgraduate page.

    Fills title, award, teaching type, duration, location, dates,
    department, overview, fee, modules, entry and language requirements,
    careers and the fixed application texts, then yields the item.  Any
    exception raised while parsing is printed and appended to a
    per-university error file instead of propagating.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = "University of Bristol"
    item['url'] = response.url
    # Default degree type; reassigned once the award text has been read.
    item['degree_type'] = 2
    print("===========================")
    print(response.url)

    def extract_all(xpath_query):
        """Return the list of strings matched by *xpath_query*."""
        return response.xpath(xpath_query).extract()

    # Keys are assigned in this order for every language profile.
    score_fields = ('ielts', 'ielts_l', 'ielts_s', 'ielts_r', 'ielts_w',
                    'toefl', 'toefl_l', 'toefl_r', 'toefl_s', 'toefl_w')
    profile_scores = (
        ("Profile A", (7.5, 7.0, 7.0, 7.0, 7.0, 109, 25, 25, 25, 29)),
        ("Profile B", (7.0, 6.5, 6.5, 6.5, 6.5, 100, 24, 24, 24, 24)),
        ("Profile C", (6.5, 6.5, 6.5, 6.5, 6.5, 92, 23, 23, 23, 24)),
        ("Profile D", (6.5, 6.0, 6.0, 7.0, 7.0, 92, 21, 21, 21, 27)),
        ("Profile E", (6.5, 6.0, 6.0, 6.0, 6.0, 90, 20, 20, 20, 20)),
        ("Profile F", (6.0, 6.5, 6.5, 6.0, 6.0, 86, 20, 20, 20, 23)),
    )

    try:
        # Programme title
        title_parts = extract_all("//h1[@id='pagetitle']/span//text()")
        item['programme_en'] = ''.join(title_parts).replace("\n", " ").replace("\r", " ").strip()
        print("item['programme_en']: ", item['programme_en'])

        # Award, which also decides the teaching type and degree type
        award_parts = extract_all("//th[contains(text(),'Awards available')]/following-sibling::td[1]//text()")
        item['degree_name'] = clear_space_str(''.join(award_parts))
        print("item['degree_name']: ", item['degree_name'])
        award_lower = item['degree_name'].lower()
        mentions_research = "research" in award_lower
        if "phd" in award_lower or "md" in award_lower:
            item['teach_type'] = "phd"
            if mentions_research:
                item['teach_type'] += " " + "research"
            item['degree_type'] = 3
        elif mentions_research:
            item['teach_type'] = "research"
            item['degree_type'] = 3
        else:
            item['teach_type'] = "taught"
            item['degree_type'] = 2

        # Duration and mode of study
        length_parts = extract_all("//th[@scope='row'][contains(text(),'Programme length')]/following-sibling::td[1]//text()")
        clear_space(length_parts)
        length_text = ''.join(length_parts)
        item['teach_time'] = getTeachTime(length_text)
        duration_pair = getIntDuration(length_text)
        if len(duration_pair) == 2:
            item['duration'] = duration_pair[0]
            item['duration_per'] = duration_pair[1]

        # Location
        location_parts = extract_all("//th[@scope='row'][contains(text(),'Location of programme')]/following-sibling::td[1]//text()")
        item['location'] = clear_space_str(''.join(location_parts))

        # Start date: taken from the last text node of the row
        startdate = extract_all("//th[@scope='row'][contains(text(),'Start date')]/following-sibling::td[1]//text()")
        clear_space(startdate)
        print("startdate = ", startdate)
        if startdate:
            item['start_date'] = getStartDate(''.join(startdate[-1]))
        print("item['start_date'] = ", item['start_date'])

        # Application deadline
        deadline_parts = extract_all("//div[@id='apply']/div[@class='apply-deadline']/p[1]//text()")
        item['deadline'] = getStartDate(''.join(deadline_parts))

        # Department: last contact-address line naming a School or Faculty
        address_lines = extract_all("//div[@id='contact']/p[@class='pg-contact-address']/text()")
        clear_space(address_lines)
        for address_line in address_lines:
            if "School" in address_line or "Faculty" in address_line:
                item['department'] = address_line
        if item['department'] == "":
            # Fall back to the first "School of ..." found in the page text.
            page_texts = extract_all("//main[@class='content']//text()")
            clear_space(page_texts)
            school_matches = re.findall(r"School\sof.{1,30}", ''.join(page_texts), re.I)
            if school_matches:
                item['department'] = school_matches[0].strip()

        # Overview
        overview_nodes = extract_all("//div[@id='programme-overview']|//div[@id='pgr-overview']")
        item['overview_en'] = remove_class(clear_lianxu_space(overview_nodes))

        # Tuition fee: overseas full-time figure first, any overseas figure second
        tuitionFee = extract_all("//dt[contains(text(),'Overseas: full-time')]/following-sibling::dd[1]//text()")
        clear_space(tuitionFee)
        print("tuitionFee = ", tuitionFee)
        if tuitionFee:
            item['tuition_fee_pre'] = "£"
            item['tuition_fee'] = int(''.join(tuitionFee[0]).replace("£", "").replace(",", "").strip())
        if item['tuition_fee'] is None:
            tuitionFee1 = extract_all("//dl//dt[contains(text(),'Overseas:')]/following-sibling::dd[1]//text()")
            clear_space(tuitionFee1)
            print("tuitionFee1 = ", tuitionFee1)
            if tuitionFee1:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = getTuition_fee(''.join(tuitionFee1))
        if item['tuition_fee'] == 0:
            item['tuition_fee_pre'] = ""
            item['tuition_fee'] = None
        if item['tuition_fee'] is None:
            print("tuition_fee 为空")
        print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
        print("item['tuition_fee']: ", item['tuition_fee'])

        # Modules / research groups
        module_nodes = extract_all("//div[@id='programme-structure']|//div[@id='pgr-research-groups']")
        item['modules_en'] = remove_class(clear_lianxu_space(module_nodes))
        print("item['modules_en']: ", item['modules_en'])

        # Academic entry requirements
        entry_texts = extract_all("//div[@id='entry-requirements']//text()")
        item['rntry_requirements'] = clear_lianxu_space(entry_texts)

        # English language requirement: a named profile or free text
        language_texts = extract_all("//*[contains(text(),'Profile')]//text()|//div[contains(text(),'IELTS')]//text()")
        item['ielts_desc'] = clear_lianxu_space(language_texts)
        language_desc = item['ielts_desc']
        for profile_name, scores in profile_scores:
            if language_desc == profile_name:
                for field, score in zip(score_fields, scores):
                    item[field] = score
                break
        else:
            if "Profile" not in language_desc:
                ielts_scores = get_ielts(language_desc)
                item['ielts'] = ielts_scores.get("IELTS")
                item['ielts_l'] = ielts_scores.get("IELTS_L")
                item['ielts_s'] = ielts_scores.get("IELTS_S")
                item['ielts_r'] = ielts_scores.get("IELTS_R")
                item['ielts_w'] = ielts_scores.get("IELTS_W")

        # Careers
        career_nodes = extract_all("//div[@id='careers']")
        item['career_en'] = remove_class(clear_lianxu_space(career_nodes))

        # Fixed entry-requirement text
        require_chinese_en = """<h2 id="pgentryreqs">Entry requirements for postgraduate programmes</h2> <p>You should <a href="/pg-howtoapply/">apply online</a> for all our postgraduate programmes.</p> <p>To be considered for admission to postgraduate study at the University of Bristol, the minimum requirement for entry is an undergraduate (Bachelor’s) degree that is equivalent to a UK Upper Second Class degree (also known as a 2:1). Please refer to the <a href="http://www.bristol.ac.uk/study/postgraduate/admissions-statements/%20%20%20" target="_blank">Postgraduate Admissions Statements</a> for each programme for individual entry requirements.</p> <ul> <li>Applicants who hold a 4-year Bachelor's (Honours) degree from a prestigious university with a minimum of 80% will be considered for admission to a Master's degree.</li> <li>Applicants who hold a good Master's degree from a prestigious university will be considered for admission to PhD study.</li> <li>Applicants will be required to meet the English language requirements for the programme. The profile level requirements can be found on the <a href="http://www.bristol.ac.uk/study/language-requirements/" target="_blank">English language requirements for study</a> page.</li> </ul>"""
        item["require_chinese_en"] = remove_class(require_chinese_en)

        # Fixed application-process text
        # (http://www.bristol.ac.uk/study/postgraduate/apply/)
        item['apply_proces_en'] = remove_class(clear_lianxu_space(["""<p>We offer an online application system for all of our programmes, except the Postgraduate Certificate in Education for which you should <a href="https://www.ucas.com/ucas/teacher-training/ucas-teacher-training-apply-and-track">apply through UCAS</a>.</p> <p>You can use our online admissions system to:</p> <ul> <li>submit all your application details securely online and view your completed application form;</li> <li>upload supporting documents;</li> <li>request references electronically;</li> <li>track the progress of your application;</li> <li>receive a decision on your application online;</li> <li>update your contact details (it is important you tell us if you change your home address or email);</li> <li>receive useful information about the University and your application.</li> </ul> <p>If you are unable to make an online application, please contact the Enquiries team on <a href="mailto:[email protected]">[email protected]</a>.</p>"""]))

        # Everything before the "English language requirements" heading
        document_nodes = extract_all("//h3[contains(text(),'English language requirements')]/preceding-sibling::*[position()<last()]")
        item["apply_documents_en"] = remove_class(clear_lianxu_space(document_nodes))
        print("item['apply_documents_en']: ", item['apply_documents_en'])

        yield item
    except Exception as e:
        # Report the failure and append it to the error log for this
        # university and degree type.
        print("异常:", str(e))
        print("报错链接:", response.url)
        error_path = "scrapySchool_England/error/" + item['university'] + str(item['degree_type']) + ".txt"
        with open(error_path, 'a+', encoding="utf-8") as f:
            f.write(str(e) + "\n" + response.url + "\n========================\n")
def parse(self, response):
    """Build one item from an Ulster University course page.

    Splits the page heading into programme name and award, copies the
    overview, modules, entry-requirement and career sections, and parses
    start dates, IELTS scores and the international tuition fee.  Yields a
    single populated item.
    """
    print(response.url)
    item = get_item1(ScrapyschoolEnglandItem1)
    item['university'] = 'Ulster University'
    item['url'] = response.url
    item['location'] = 'Belfast'
    item['teach_time'] = '1'

    # The heading is "<programme> - <award>": everything from the first
    # hyphen onwards is treated as the award.
    programme = ''.join(response.xpath('//h1//text()').extract()).strip()
    degr = ''.join(re.findall(r'-.+', programme))
    programme = programme.replace(degr, '').replace('*', '').strip()
    degr = degr.replace('-', '').strip()
    item['programme_en'] = programme
    item['degree_name'] = degr

    # Degree type comes from the award's first letter; any other award,
    # including an empty one, leaves the item's existing value untouched.
    if degr.startswith('M'):
        item['degree_type'] = '2'
    elif degr.startswith('P'):
        item['degree_type'] = '3'

    overview = response.xpath(
        '//h2[contains(text(),"Overview")]/following-sibling::*').extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath('//div[@id="modules"]').extract()
    item['modules_en'] = remove_class(modules)

    rntry = response.xpath('//div[@id="entryconditions"]').extract()
    item['rntry_requirements'] = remove_class(rntry)

    career = response.xpath('//div[@id="opportunities"]').extract()
    item['career_en'] = remove_class(career)

    start_date = response.xpath(
        '//h3[contains(text(),"Start dates")]/following-sibling::*//text()'
    ).extract()
    start_date = tracslateDate(start_date)
    # De-duplicate while keeping first-seen order.  A set made the joined
    # order vary between runs.
    item['start_date'] = '.'.join(dict.fromkeys(start_date)).strip()

    ielts = response.xpath('//*[contains(text(),"IELTS")]//text()').extract()
    ielts = get_ielts(ielts)
    # The old test used "or", which is always true; "and" matches the
    # check used by the Glasgow spider in this file.
    if ielts != [] and ielts != {}:
        try:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']
        except (KeyError, TypeError, IndexError):
            # Best effort: an incomplete or unexpected result keeps
            # whatever was copied before the missing key.
            pass

    fee = response.xpath(
        '//dt[contains(text(),"International:")]/following-sibling::dd/text()'
    ).extract()
    item['tuition_fee'] = getTuition_fee(fee)
    item['tuition_fee_pre'] = '£'
    yield item
def parse_main(self, response):
    """Build one item from a University of Glasgow programme page.

    Sets the fixed fields, then scrapes title, award, duration, overview,
    modules, careers, fee, IELTS/TOEFL scores, entry requirements and
    application documents.  The item is yielded only when a programme
    title was found.
    """
    item = get_item1(ScrapyschoolEnglandItem1)
    print(response.url)
    item['teach_time'] = 'fulltime'
    item['university'] = 'University of Glasgow'
    item['url'] = response.url
    item['location'] = 'Glasgow'
    item['start_date'] = '2018-9'
    item['deadline'] = '2018-7'
    item["tuition_fee_pre"] = "£"
    item['teach_type'] = 'taught'

    programme = ''.join(
        response.xpath('//div[@id="prog-title"]/h1/text()').extract())
    item['programme_en'] = programme

    # The award is in a <span> inside the title heading.
    item['degree_name'] = ''.join(
        response.xpath('//div[@id="prog-title"]/h1/span/text()').extract())

    duration = clear_duration(
        response.xpath('//li[contains(text(),"full-time")]/text()').extract())
    item['duration'] = duration['duration']
    item['duration_per'] = duration['duration_per']

    overview = response.xpath(
        '//h2[contains(text(),"Why this programme")]/following-sibling::*'
    ).extract()
    item['overview_en'] = remove_class(overview)

    modules = response.xpath(
        '//h2[contains(text(),"Programme str")]/following-sibling::*'
    ).extract()
    item['modules_en'] = remove_class(clear_same_s(modules))

    career = response.xpath(
        '//h2[contains(text(),"Career")]/following-sibling::*').extract()
    item['career_en'] = remove_class(clear_same_s(career))

    # Only the #fees block is used.  An earlier "Fees and ..." query was
    # immediately overwritten by this one and has been removed.
    fees = response.xpath('//div[@id="fees"]//text()').extract()
    tuition_fee = getTuition_fee(fees)
    # NOTE(review): 2018 is presumably a year picked up instead of a fee;
    # it is replaced by '0' exactly as before - confirm against the helper.
    if tuition_fee == 2018:
        tuition_fee = '0'
    item['tuition_fee'] = tuition_fee

    ielts_texts = response.xpath(
        '//*[contains(text(),"IELTS")]/../following-sibling::ul[1]//text()'
    ).extract()
    ielts = get_ielts(ielts_texts)
    if ielts != {} and ielts != []:
        item['ielts_l'] = ielts['IELTS_L']
        item['ielts_s'] = ielts['IELTS_S']
        item['ielts_r'] = ielts['IELTS_R']
        item['ielts_w'] = ielts['IELTS_W']
        item['ielts'] = ielts['IELTS']

    toefl_texts = response.xpath(
        '//*[contains(text(),"TOEFL")]/..//text()').extract()
    toefl = get_toefl(toefl_texts)
    if toefl != []:
        try:
            item['toefl_r'] = toefl[1]
            item['toefl_l'] = toefl[2]
            item['toefl_s'] = toefl[3]
            item['toefl_w'] = toefl[4]
            item['toefl'] = toefl[0]
        except (IndexError, KeyError, TypeError):
            # Best effort: a short or unexpected result keeps whatever
            # was copied before the missing entry.
            pass

    entry = response.xpath(
        '//h2[contains(text(),"Entry requirements")]/following-sibling::*'
    ).extract()
    item['rntry_requirements'] = remove_class(clear_same_s(entry))

    apply_d = response.xpath(
        '//h3[contains(text(),"Documents")]/following-sibling::ul[1]'
    ).extract()
    item['apply_proces_en'] = remove_class(clear_same_s(apply_d))

    # Pages without a programme title produce no item.
    if programme != '':
        yield item
def parse(self, response):
    """Build one item from an Edge Hill University postgraduate course page.

    Splits the heading into award and programme name, then scrapes length,
    start dates, department, location, overview, assessment, modules,
    entry requirements, IELTS scores, careers, tuition fee and application
    process.  Yields a single populated item.
    """
    item = get_item1(ScrapyschoolEnglandItem1)

    def joined(xpath_query):
        """Return all nodes matched by *xpath_query* concatenated into one string."""
        return ''.join(response.xpath(xpath_query).extract())

    university = 'Edge Hill University'
    url = response.url

    # The heading is "<award> <programme name>".  Only the leading token is
    # removed, and an empty heading no longer raises IndexError.
    heading = remove_tags(joined('//*[@id="primary"]/header/h1/a'))
    heading_parts = heading.split(None, 1)
    degree_name = heading_parts[0] if heading_parts else ''
    programme_en = heading_parts[1].strip() if len(heading_parts) > 1 else ''
    degree_type = 2

    # Length row, e.g. a number followed by a unit and the study mode.
    teach_time_text = remove_tags(joined(
        "//*[contains(text(),'Length:')]//following-sibling::*"))
    length_numbers = re.findall(r'\d+', teach_time_text)
    # None when the page states no length, instead of crashing on [0].
    duration = length_numbers[0] if length_numbers else None
    if 'Months' in teach_time_text:
        duration_per = 3
    elif 'Weeks' in teach_time_text:
        duration_per = 4
    else:
        duration_per = 1
    teach_time = 'Full-Time' if 'Full-Time' in teach_time_text else 'Part-Time'

    start_date = remove_tags(joined(
        "//*[contains(text(),'Dates:')]//following-sibling::*"))
    start_date = ','.join(tracslateDate(start_date))

    department = remove_tags(joined(
        "//*[contains(text(),'Department:')]//following-sibling::*"))

    location = remove_tags(joined(
        "//*[contains(text(),'Location:')]//following-sibling::*"))

    overview_en = '<p>' + remove_class(joined(
        '//*[@id="overview"]/div[1]/div/ul/li/text()')) + '</p>'

    assessment_en = remove_class(joined(
        "//*[contains(text(),'How will I be assessed?')]//following-sibling::*[1]"))

    modules_en = remove_class(joined('//*[@id="modules"]/h4/strong'))

    rntry_requirements = remove_class(joined(
        "//*[contains(text(),'Entry Requirements')]//following-sibling::*"))

    # IELTS: one score applies to every band; two scores are read as
    # overall then per-component; anything else uses the defaults.
    ielts_text = remove_tags(joined(
        "//*[contains(text(),'English Language Requirements')]//following-sibling::*[1]"))
    ielts_scores = re.findall(r'\d\.\d', ielts_text)
    if len(ielts_scores) == 1:
        ielts = component = ielts_scores[0]
    elif len(ielts_scores) == 2:
        ielts, component = ielts_scores
    else:
        ielts, component = 6.5, 6.0
    ielts_r = ielts_w = ielts_s = ielts_l = component

    career_en = remove_class(joined(
        "//*[contains(text(),'What are my career prospects?')]//following-sibling::*"))

    tuition_fee = getTuition_fee(remove_tags(joined(
        "//*[contains(text(),'Tuition Fees')]//following-sibling::*")))
    tuition_fee_pre = '£'

    apply_proces_en = remove_class(joined(
        "//h4[contains(text(),'How to Apply')]//following-sibling::*"))

    apply_pre = '£'

    item['apply_pre'] = apply_pre
    item['university'] = university
    item['url'] = url
    item['programme_en'] = programme_en
    item['degree_type'] = degree_type
    item['degree_name'] = degree_name
    item['teach_time'] = teach_time
    item['duration'] = duration
    item['duration_per'] = duration_per
    item['start_date'] = start_date
    item['department'] = department
    item['location'] = location
    item['overview_en'] = overview_en
    item['assessment_en'] = assessment_en
    item['modules_en'] = modules_en
    item['rntry_requirements'] = rntry_requirements
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['ielts_l'] = ielts_l
    item['ielts_s'] = ielts_s
    item['career_en'] = career_en
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = tuition_fee_pre
    item['apply_proces_en'] = apply_proces_en
    yield item