def parse(self, response):
    """Print the links found on a listing page and fill a placeholder item.

    Every ``//li/h3/a/@href`` value of *response* is extracted and printed.
    A fresh ``ScrapyschoolCanadaCollegeItem`` is then filled with default
    values; the item is neither yielded nor returned by this callback.
    """
    item = getItem.get_item(ScrapyschoolCanadaCollegeItem)

    hrefs = response.xpath("//li/h3/a/@href").extract()
    print(hrefs)

    # Field defaults, applied in this order.  ``location``, ``campus`` and
    # ``programme_code`` are deliberately not assigned here.
    defaults = (
        ('school_name', None),
        ('department', None),
        ('degree_name', None),
        ('degree_name_desc', None),
        ('major_name_en', None),
        ('overview_en', None),
        ('start_date', None),
        ('duration', None),
        ('duration_per', '1'),
        ('modules_en', None),
        ('career_en', None),
        ('deadline', None),
        ('apply_pre', 'CAD$'),
        ('apply_fee', None),
        ('tuition_fee_pre', 'CAD$'),
        ('tuition_fee', None),
        ('tuition_fee_per', '1'),
        ('entry_requirements_en', None),
        ('require_chinese_en', None),
        ('specific_requirement_en', None),
        ('average_score', None),
        ('gaokao_desc', None),
        ('gaokao_zs', None),
        ('huikao_desc', None),
        ('huikao_zs', None),
        ('ielts_desc', None),
        ('ielts', None),
        ('ielts_l', None),
        ('ielts_s', None),
        ('ielts_r', None),
        ('ielts_w', None),
        ('toefl_code', None),
        ('toefl_desc', None),
        ('toefl', None),
        ('toefl_l', None),
        ('toefl_s', None),
        ('toefl_r', None),
        ('toefl_w', None),
        ('interview_desc_en', None),
        ('portfolio_desc_en', None),
        ('other', None),
        ('url', response.url),
        ('degree_level', None),
    )
    for field, value in defaults:
        item[field] = value
def parse(self, response):
    """Scrape one SAIT programme page and yield programme items.

    School-wide values (address, English-language scores, document
    requirements for Chinese applicants, application fee) are hard-coded.
    Page-specific values are read from *response* with XPath.  Nothing is
    yielded unless the credential maps to a known ``degree_level``
    (Diploma -> 3, Bachelor -> 1, Post-... -> 2).  When the programme page
    lists majors in its first striped table, one item is yielded per major;
    otherwise a single item is yielded.

    Any exception raised while scraping is appended to the per-school error
    file and printed; it is not re-raised.

    NOTE(review): the source this was rebuilt from had lost its indentation,
    so the nesting below is inferred from the logic — confirm against the
    original file.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)
    item['school_name'] = "Southern Alberta Institute of Technology"
    item['url'] = response.url
    print("===========================")
    print(response.url)

    item['location'] = '1301-16 Avenue NW Calgary AB, T2M 0L4'
    # Stored note (Chinese): "Known issues: 1. no campus and no programme code".
    item['other'] = """问题描述: 1.没有校区和专业代码"""

    # https://www.sait.ca/admissions/admission-and-selection/english-proficiency
    item['ielts_desc'] = '6.0 in each skill/category'
    item['ielts'] = '6.0'
    item['ielts_l'] = '6.0'
    item['ielts_s'] = '6.0'
    item['ielts_r'] = '6.0'
    item['ielts_w'] = '6.0'
    item['toefl_desc'] = 'A minimum score of 20 in each category'
    item['toefl'] = None
    item['toefl_l'] = '20'
    item['toefl_s'] = '20'
    item['toefl_r'] = '20'
    item['toefl_w'] = '20'

    # https://www.sait.ca/admissions/admission-and-selection/international-document-assessment
    item['require_chinese_en'] = """<div> <span>China</span> <span></span> <div> <table> <tbody> <tr> <td> <p>We require precise, word-for-word English translations. Documents can be translated by a Canadian certified translator, Immigrant Services Calgary, your institution, or any other professional translation service in your country.</p> <p>Documents from China need to be verified by China Higher Education Student Information and Career Center (CHESICC: 学信网) or China Academic Degrees & Graduation Education Information (CDGDC: 学位网). 
Documents can be translated by one of these verification services.</p> <p><strong>Original secondary school documents required:</strong></p> <ul> <li>Senior High School Graduation Diploma (高中毕业证书)</li> <li>Senior High School transcript (成绩单)</li> <li>Verification report ( 认证报告 ) from CHESICC (学信网) or CDGDC (学位网)</li> </ul> <p><strong>Original post-secondary documents required:</strong></p> <ul> <li> Zhuanke ( 专科 ): <ul> <li>Graduation certificate (毕业证书)</li> <li>Academic transcript (成绩单)</li> <li>Verification report (认证报告) from CHESICC (学信网) or CDGDC (学位网)</li> </ul> </li> </ul> <ul> <li>Benke or higher (本科及以上学历): <ul> <li>Graduation certificate (毕业证书)</li> <li>Degree certificate (学位证书)</li> <li>Academic transcript (成绩单)</li> <li>Verification report (认证报告) from CHESICC (学信网) or CDGDC (学位网)</li> </ul> </li> </ul> </td> </tr> </tbody> </table> </div> </div> """

    item['apply_pre'] = "CAD$"
    # From the school's site: "You will need a Visa or Mastercard to pay the
    # $75 online application fee. A hard-copy application is available — the
    # application fee is $175. Application fees are non-refundable."
    item['apply_fee'] = '75'

    try:
        major_name_en = response.xpath("//div[@class='middle g-text-center']/h1//text()").extract()
        clear_space(major_name_en)
        item['major_name_en'] = ''.join(major_name_en).strip()
        print("item['major_name_en']: ", item['major_name_en'])

        degree_name = response.xpath("//span[contains(text(),'Credential:')]/../text()").extract()
        clear_space(degree_name)
        item['degree_name'] = ''.join(degree_name).strip()
        print("item['degree_name']: ", item['degree_name'])

        # Independent checks, so a later match overrides an earlier one.
        if item['degree_name'] == "Diploma":
            item['degree_level'] = 3
        if "Bachelor" in item['degree_name']:
            item['degree_level'] = 1
        if "Post" in item['degree_name']:
            item['degree_level'] = 2
        print("item['degree_level']: ", item['degree_level'])

        # Only programmes with a recognised credential are scraped further.
        if item['degree_level'] is not None:
            duration = response.xpath("//span[contains(text(),'Length:')]/../text()").extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_text = ''.join(duration)

            # Work out the unit of the programme length.
            duration_lower = duration_text.lower()
            if "year" in duration_lower:
                item['duration_per'] = 1
            if "month" in duration_lower:
                item['duration_per'] = 3
            if "week" in duration_lower:
                item['duration_per'] = 4
            item['duration'] = ''.join(re.findall(r"\d+", duration_text)).strip()

            start_date = response.xpath("//div[@class='col-1of2']//table[@class='g-table g-table-striped']//td[2]//text()").extract()
            clear_space(start_date)
            print("start_date: ", start_date)
            month_numbers = {
                "january": "01", "february": "02", "march": "03", "april": "04",
                "may": "05", "june": "06", "july": "07", "august": "08",
                "september": "09", "october": "10", "november": "11", "december": "12",
                "jan": "01", "feb": "02", "mar": "03", "apr": "04", "jun": "06",
                "jul": "07", "aug": "08", "sep": "09", "oct": "10", "nov": "11",
                "dec": "12", "sept": "09",
            }
            start_dates = []
            for cell in start_date:
                month_key = ''.join(re.findall(r"[A-Za-z]+", cell)).lower().strip()
                month = month_numbers.get(month_key)
                if month is None:
                    # Not a date cell (or an unrecognised month name): skip it.
                    continue
                year = ''.join(re.findall(r"\d{4}", cell))
                # The day is the number written directly before a comma,
                # e.g. "September 3, 2019".
                day = ''.join(re.findall(r"\d+(?=,)", cell))
                # Zero-pad instead of always prefixing "0", so that two-digit
                # days stay valid ("13" -> "13", "3" -> "03").
                parts = (year, month, day.zfill(2) if day else "")
                start_dates.append("-".join(part for part in parts if part))
            item['start_date'] = ",".join(start_dates)
            print("item['start_date']: ", item['start_date'])

            overview = response.xpath(
                "//h3[contains(text(),'Majors')]/preceding-sibling::*[position()<last()]").extract()
            if not overview:
                overview = response.xpath("//h3[contains(text(),'Your Career')]/preceding-sibling::*[position()<last()]|"
                                          "//h3[contains(text(),'Your career')]/preceding-sibling::*[position()<last()]").extract()
            if overview:
                item['overview_en'] = remove_class(clear_lianxu_space(overview))
            print("item['overview_en']: ", item['overview_en'])

            page_html = response.text

            def first_present(candidates):
                """Return the first markup variant found in the page, else None."""
                return next((marker for marker in candidates if marker in page_html), None)

            # The career section sits between these two headings; the site
            # uses several markup variants for each of them.
            career_start = first_present((
                r"<h3>Your Career</h3>",
                '<h3 style="text-align: left;">Your Career</h3>',
                '<h3>Your career</h3>',
            ))
            career_end = first_present((
                r"<h3>Student Success</h3>",
                '<h3 style="text-align: left;">Student Success</h3>',
                '<h3><a name="CET-Success"></a>Student Success</h3>',
            ))
            if career_start is not None and career_end is not None:
                item['career_en'] = remove_class(getContentToXpath(page_html, career_start, career_end))

            entry_requirements_en = response.xpath(
                "//div[@id='admission_requirements']//div[@class='g-section g-container-sm']").extract()
            if entry_requirements_en:
                item['entry_requirements_en'] = remove_class(clear_lianxu_space(entry_requirements_en))

            modules_en = response.xpath(
                "//div[@id='courses']/div[@class='g-section g-container-sm']").extract()
            if modules_en:
                item['modules_en'] = remove_class(clear_lianxu_space(modules_en))

            tuition_fee = response.xpath("//h3[contains(text(),'International Tuition and Fees*')]/following-sibling::table//tr[1]/td[2]//text()").extract()
            clear_space(tuition_fee)
            item['tuition_fee_pre'] = 'CAD$'
            item['tuition_fee'] = ''.join(tuition_fee).replace("$", "").strip()
            item['tuition_fee_per'] = 1

            department = response.xpath("//h2[contains(text(),'Contact Information')]/following-sibling::p/strong//text()").extract()
            item['department'] = ''.join(department).strip()

            # Some credentials are split into majors: emit one item per major.
            major_list_table = response.xpath(
                "//div[@id='program_details']/div[@class='g-section g-container-sm']/table[@class='g-table g-table-striped'][1]/tbody/tr/td[1]/strong//text()").extract()
            if major_list_table:
                print("major_list_table: ", major_list_table)
                for major_name in major_list_table:
                    item['major_name_en'] = major_name.strip()
                    # Yield a copy so that renaming the major for the next
                    # item cannot change one that was already handed on.
                    yield item.copy()
            else:
                yield item
    except Exception as e:
        with open("scrapySchool_Canada_College/error/" + item['school_name'] + ".txt", 'a', encoding="utf-8") as f:
            f.write(str(e) + "\n" + response.url + "\n========================\n")
        print("异常:", str(e))
        print("报错url:", response.url)
def parse(self, response):
    """Fill a Douglas College programme item from a programme page.

    Page-specific fields (name, length, campus, faculty, credential,
    overview, intake, curriculum, career pathways) are read from *response*
    with XPath on a best-effort basis: a field whose node is missing is
    stored as ``None``.  School-wide values (fees, entry requirements,
    English-language scores) are constants.

    NOTE(review): the source this was rebuilt from had lost its indentation
    and ends with a commented-out ``yield item``; as written the callback
    builds the item but does not emit it — confirm whether the yield should
    be restored.
    """
    item = getItem.get_item(ScrapyschoolCanadaCollegeItem)

    # Strips HTML attributes (class="...", style='...') from extracted markup.
    attribute_re = re.compile(r' [a-zA-Z\-]*=[\'"].+?[\'"]')

    def first_node(xpath):
        """Return the markup of the first node matching *xpath*, or None."""
        try:
            nodes = response.xpath(xpath).extract()
        except Exception:
            # Best effort: a field that cannot be read is stored as None.
            return None
        return nodes[0] if nodes else None

    def first_text(xpath, **remove_tags_kwargs):
        """Return the tag-stripped text of the first matching node, or None."""
        node = first_node(xpath)
        return None if node is None else remove_tags(node, **remove_tags_kwargs)

    def joined_markup(xpath):
        """Return all matching nodes concatenated, attributes removed, or None."""
        try:
            markup = ''.join(response.xpath(xpath).extract())
        except Exception:
            return None
        return attribute_re.sub('', markup)

    def squeeze_layout(markup):
        """Drop line breaks, tabs and indentation runs from *markup*.

        NOTE(review): the visible source replaced a single space twice,
        which would delete every space in the text; the literals were
        presumably wider whitespace runs, so runs of two or more spaces
        are removed here — confirm.
        """
        markup = markup.replace('\r\n', '').replace('\n', '').replace('\t', '')
        return re.sub(r' {2,}', '', markup)

    # Programme name: keep only the part before a trailing "(...)".
    major_name_en = first_text('//h1')
    if major_name_en is not None and '(' in major_name_en:
        before_paren = re.findall(r'(.* )\(.*\)', major_name_en)
        major_name_en = before_paren[0] if before_paren else None

    # The page publishes no programme code.
    programme_code = None

    # Programme length: (phrase on the page, stored length, stored unit).
    # The first phrase found wins; unit '2' is used for semesters and '1'
    # for years.
    duration_phrases = (
        ('Eight semesters', '8', '2'),
        ('Four semesters', '4', '2'),
        ('Three semesters', '3', '2'),
        ('One year', '1', '1'),
        ('Two semesters', '2', '2'),
        ('Two years', '2', '1'),
        ('One semester', '1', '2'),
        ('Six semesters', '6', '2'),
        ('Five semesters', '5', '2'),
        ('Self-paced', 'No', '2'),
    )
    duration = first_text('//span[contains(text(),"Length")]/following-sibling::span')
    if duration is None:
        duration_per = None
    else:
        for phrase, length, unit in duration_phrases:
            if phrase in duration:
                duration, duration_per = length, unit
                break
        else:
            # Unrecognised wording: keep the raw text and default the unit.
            duration_per = '1'

    school_name = 'Douglas College'

    # The page only names the campus, which is used for both fields.
    location = first_text('//span[contains(text(),"Campu")]/following-sibling::span')
    campus = location

    department = first_text(
        '//span[contains(text(),"Faculty:")]/following-sibling::span',
        keep=("i",))
    if department is not None:
        # NOTE(review): possibly meant to strip the '&amp;' entity — confirm.
        department = department.replace('&', '')

    # Credential and the degree level derived from it.  degree_level is
    # bound up front so that it exists even when the credential is missing
    # (it was previously undefined in that case).
    degree_level = None
    degree_name = first_text('//span[contains(text(),"Credential:")]/following-sibling::span')
    if degree_name is not None:
        if "Bachelor's Degree" in degree_name:
            degree_level = '1'
        elif 'Associate Degree' in degree_name:
            degree_level = '4'
        elif 'Advanced Certificate' in degree_name:
            degree_level = '4'
        elif ('Post-Degree Diploma' in degree_name
              or 'Post-Baccalaureate Diploma' in degree_name
              or 'Graduate Diploma' in degree_name):
            degree_level = '2'
        elif 'Diploma' in degree_name:
            degree_level = '3'

    # The overview doubles as the programme and the degree description.
    overview_en = joined_markup('//*[@id="overview"]/div')
    degree_name_desc = overview_en

    # Intake: season names are mapped onto fixed 2019 months.
    start_date = first_text('//span[contains(text(),"Offered:")]/following-sibling::span')
    if start_date is not None:
        start_date = start_date.replace('Winter', '2019-01').replace(
            'Spring', '2019-01').replace('Summer', '2019-05').replace('Fall', '2019-09')

    # Curriculum.
    modules_en = joined_markup(
        '//span[contains(text(),"curriculum framework")]/following-sibling::div')
    if modules_en is not None:
        modules_en = squeeze_layout(modules_en)

    # Career pathways (first matching node only).
    career_en = first_node(
        '//span[contains(text(),"career transfer pathways")]/following-sibling::div')
    if career_en is not None:
        career_en = squeeze_layout(attribute_re.sub('', career_en))

    # Application deadlines, keyed by which intakes (Jan, May, Sep) occur in
    # start_date.
    deadlines_by_intake = {
        (True, True, True): '2019-05-31,2018-09-30,2019-01-30',
        (True, True, False): '2018-09-30,2019-01-30',
        (False, True, True): '2019-01-30,2019-05-31',
        (True, False, True): '2018-09-30,2019-05-31',
        (True, False, False): '2018-09-30',
        (False, True, False): '2019-01-30',
        (False, False, True): '2019-05-31',
    }
    deadline = None
    if start_date is not None:
        intakes = tuple(
            month in start_date for month in ('2019-01', '2019-05', '2019-09'))
        deadline = deadlines_by_intake.get(intakes)

    # School-wide constants.
    tuition_fee = '692.56'
    apply_fee = '90'
    entry_requirements_en = '<ul><li>Undergraduate (diploma and associate degree) programs:<ul><li>High school graduation OR minimum 17 years of age by the end of the first month of studies in the semester of entry to Douglas College; and</li><li>Minimum overall grade average of 60% (or equivalent) in the final year of high school; and</li><li>Minimum final grade of “C” or 60% (or equivalent) in Grade 11 Mathematics or equivalent for most programs</li></ul></li></ul><ul><li>Post-Graduate (Post-Degree and Post-Baccalaureate) programs:<br><ul><li>Graduation from a recognized degree granting post-secondary institution with a minimum 3-year bachelor degree; and</li><li>Minimum cumulative grade point average of 60% (or equivalent) during the bachelor degree program</li></ul></li></ul>'
    # Requirements for Chinese applicants are the general entry requirements.
    require_chinese_en = entry_requirements_en

    # The IELTS and TOEFL descriptions share one text.
    english_desc = "English Requirements for ELLA placement You must submit proof of ONE of the following: TOEFL - minimum score 45 IBT ((Douglas College only accepts an official TOEFL score. Douglas College's institutional code is 9568). IELTS – minimum score 4.5, no band below 4.5"
    other = 'IB:Minimum grade of 3 or C ap:Minimum grade of 3 or C '

    values = {
        'school_name': school_name,
        'location': location,
        'campus': campus,
        'department': department,
        'degree_name': degree_name,
        'degree_name_desc': degree_name_desc,
        'major_name_en': major_name_en,
        'programme_code': programme_code,
        'overview_en': overview_en,
        'start_date': start_date,
        'duration': duration,
        'duration_per': duration_per,
        'modules_en': modules_en,
        'career_en': career_en,
        'deadline': deadline,
        'apply_pre': 'CAD$',
        'apply_fee': apply_fee,
        'tuition_fee_pre': 'CAD$',
        'tuition_fee': tuition_fee,
        'tuition_fee_per': '5',
        'entry_requirements_en': entry_requirements_en,
        'require_chinese_en': require_chinese_en,
        # Not published on the programme page.
        'specific_requirement_en': None,
        'average_score': '60',
        'gaokao_desc': None,
        'gaokao_zs': None,
        'huikao_desc': None,
        'huikao_zs': None,
        'ielts_desc': english_desc,
        'ielts': '4.5',
        'ielts_l': 4.5,
        'ielts_s': 4.5,
        'ielts_r': 4.5,
        'ielts_w': 4.5,
        'toefl_code': '9568',
        'toefl_desc': english_desc,
        'toefl': '45',
        'toefl_l': None,
        'toefl_s': None,
        'toefl_r': None,
        'toefl_w': None,
        'interview_desc_en': None,
        'portfolio_desc_en': None,
        'other': other,
        'url': response.url,
        'degree_level': degree_level,
    }
    for field, value in values.items():
        item[field] = value

    # NOTE(review): the yield is commented out in the visible source, so no
    # item is emitted — confirm whether this is intentional.
    # yield item
def parse_data(self, response):
    """Scrape one Red River College programme page and yield programme items.

    The programme name, credential, length and overview are read from
    *response*.  Admission requirements, IELTS band scores, courses, career
    information, and campus/start-date/tuition data are obtained through the
    sibling ``parse_*`` helpers, which are given the catalogue URLs linked
    from the page.  Nothing is yielded unless the credential maps to a known
    ``degree_level``.  When the helper returns equally long campus and
    start-date lists, one item is yielded per campus/start-date pair;
    otherwise a single item is yielded.

    Any exception raised while scraping is appended to the per-school error
    file and printed; it is not re-raised.

    NOTE(review): the source this was rebuilt from had lost its indentation,
    so the nesting below is inferred from the logic — confirm against the
    original file.
    """
    catalogue_base = "https://me.rrc.mb.ca/Catalogue/"

    item = get_item(ScrapyschoolCanadaCollegeItem)
    item['school_name'] = "Red River College"
    item['url'] = response.url
    print("===========================")
    print(response.url)

    # Stored note (Chinese): "Known issues: 1. empty career fields are
    # missing on the page itself".
    item['other'] = """问题描述: 1.就业空的是页面没有的"""

    # https://www.rrc.ca/future-students/fees/#application
    item['apply_pre'] = "CAD$"
    item['apply_fee'] = '120'

    try:
        major_name_en = response.xpath(
            "//span[@id='_ctl0_lblProgramName']//text()").extract()
        clear_space(major_name_en)
        item['major_name_en'] = ''.join(major_name_en).strip()
        print("item['major_name_en']: ", item['major_name_en'])

        degree_name = response.xpath(
            "//span[@id='_ctl0_ContentPlaceHolder1_lblDescriptionText']//ul/li[1]//text()"
        ).extract()
        clear_space(degree_name)
        print("degree_name: ", degree_name)

        # The first bullet names both the credential and the length,
        # e.g. "Two-year diploma".
        duration_str = ""
        if degree_name:
            headline = degree_name[0].lower()
            if "advanced diploma" in headline:
                item['degree_name'] = "advanced diploma"
                item['degree_level'] = 3
            elif "diploma" in headline:
                item['degree_name'] = "diploma"
                item['degree_level'] = 3
            elif "degree" in headline:
                item['degree_name'] = "Bachelor degree"
                item['degree_level'] = 1
            duration_str = degree_name[0]
        print("item['degree_name']: ", item['degree_name'])
        print("item['degree_level']: ", item['degree_level'])

        # Only programmes with a recognised credential are scraped further.
        if item['degree_level'] is not None:
            number_words = {
                "one": "1", "two": "2", "three": "3", "four": "4", "five": "5",
                "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10",
            }
            unit_codes = {"year": 1, "month": 3, "week": 4}
            # Match "<amount>-<unit>" or "<amount> <unit>", where the amount
            # is digits or a number word.  Anchoring on the amount keeps
            # unrelated text that precedes the unit out of the stored length.
            duration_match = re.search(
                r"\b(\d+|" + "|".join(number_words) + r")[\s\-]+(year|month|week)",
                duration_str, re.IGNORECASE)
            print("duration_re: ", duration_match.group(0) if duration_match else None)
            if duration_match:
                amount = duration_match.group(1)
                item['duration_per'] = unit_codes[duration_match.group(2).lower()]
                # Number words become digits for every unit, not only years.
                item['duration'] = number_words.get(amount.lower(), amount)
            print("item['duration']: ", item['duration'])
            print("item['duration_per']: ", item['duration_per'])

            overview = response.xpath(
                "//b[contains(text(),'Description')]/..|//strong[contains(text(),'Description')]/..|"
                "//strong[contains(text(),'Description')]/../following-sibling::*"
            ).extract()
            if not overview:
                overview = response.xpath(
                    "//u[contains(text(),'Description')]/../..|//u[contains(text(),'Description')]/../../following-sibling::*|"
                    "//b//span[@lang='EN-US']/../..|//b//span[@lang='EN-US']/../../following-sibling::*|"
                    "//span[contains(text(),'Description')]/../..|//span[contains(text(),'Description')]/../../following-sibling::*|"
                    "//span[@id='_ctl0_ContentPlaceHolder1_lblDescriptionText']//p//strong//em/../..|//span[@id='_ctl0_ContentPlaceHolder1_lblDescriptionText']//p//strong//em/../../following-sibling::*"
                ).extract()
            if overview:
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview))

            entry_requirements_en_url = response.xpath(
                "//a[@id='_ctl0_rProgramMenu__ctl1_MenuLink']/@href"
            ).extract()
            if entry_requirements_en_url:
                requirements_url = catalogue_base + entry_requirements_en_url[0]
                item['entry_requirements_en'] = self.parse_entry_requirements_en(
                    requirements_url)
                ielts_list = self.parse_ielts(requirements_url)
                # The band scores are read from the second entry, so make
                # sure it exists before indexing (a shorter list used to
                # raise IndexError and drop the whole item).
                if ielts_list and len(ielts_list) > 1 and "," in ielts_list[1]:
                    band_fields = (
                        ("Listening", 'ielts_l'),
                        ("Speaking", 'ielts_s'),
                        ("Reading", 'ielts_r'),
                        ("Writing", 'ielts_w'),
                    )
                    for band in ielts_list[1].split(','):
                        for skill, field in band_fields:
                            if skill in band:
                                item[field] = band.replace(skill, "").strip()
            print("item['ielts_l']: ", item['ielts_l'])
            print("item['ielts_s']: ", item['ielts_s'])
            print("item['ielts_r']: ", item['ielts_r'])
            print("item['ielts_w']: ", item['ielts_w'])
            print("item['entry_requirements_en']: ",
                  item['entry_requirements_en'])

            modules_url = response.xpath(
                "//a[contains(text(), 'Courses and Descriptions')]/@href"
            ).extract()
            if modules_url:
                item['modules_en'] = self.parse_modules(
                    catalogue_base + modules_url[0])

            career_url = response.xpath(
                "//a[contains(text(), 'Employment Potential')]/@href"
            ).extract()
            if career_url:
                # The career page is fetched with the same helper as the
                # admission requirements page.
                item['career_en'] = self.parse_entry_requirements_en(
                    catalogue_base + career_url[0])

            location_date_tuition_url = response.xpath(
                "//a[contains(text(), 'Locations, Dates and Fees')]/@href"
            ).extract()
            print("location_date_tuition_url: ", location_date_tuition_url)
            item['tuition_fee_per'] = "1"
            item['tuition_fee_pre'] = "CAD$"

            if location_date_tuition_url:
                print("--------0-------")
                location_date_tuition_dict = self.parse_location_date_tuition(
                    catalogue_base + location_date_tuition_url[0])
                item['tuition_fee'] = location_date_tuition_dict.get(
                    'tuition_fee')
                # Treat a missing list as empty instead of failing on len().
                campus_list = location_date_tuition_dict.get(
                    'campus_list') or []
                raw_start_dates = location_date_tuition_dict.get(
                    'start_date_list') or []

                month_numbers = {
                    "january": "01", "february": "02", "march": "03",
                    "april": "04", "may": "05", "june": "06", "july": "07",
                    "august": "08", "september": "09", "october": "10",
                    "november": "11", "december": "12",
                    "jan": "01", "feb": "02", "mar": "03", "apr": "04",
                    "jun": "06", "jul": "07", "aug": "08", "sep": "09",
                    "oct": "10", "nov": "11", "dec": "12", "sept": "09",
                }

                def to_iso_date(raw):
                    """Convert e.g. "August 26, 2019" to "2019-08-26".

                    Text whose letters do not form a known month name is
                    returned with surrounding whitespace and commas removed.
                    """
                    month_key = ''.join(
                        re.findall(r"[A-Za-z]+", raw)).lower().strip()
                    month = month_numbers.get(month_key)
                    if month is None:
                        return raw.strip().strip(',').strip()
                    year = ''.join(re.findall(r"\d{4}", raw))
                    # The day is the number written directly before a comma;
                    # zero-pad it so single-digit days give a valid date.
                    day = ''.join(re.findall(r"\d+(?=,)", raw))
                    parts = (year, month, day.zfill(2) if day else "")
                    return "-".join(part for part in parts if part)

                # Convert every date to the required format.
                start_date_list = [to_iso_date(raw) for raw in raw_start_dates]
                print("start_date_list1: ", start_date_list)

                if len(start_date_list) == len(campus_list):
                    print("--------1-------")
                    if campus_list:
                        for campus, start in zip(campus_list, start_date_list):
                            item['campus'] = campus
                            item['start_date'] = start
                            # Yield a copy so that filling in the next campus
                            # cannot change an item already handed on.
                            yield item.copy()
                    else:
                        yield item
                else:
                    # Campuses and dates cannot be paired: store all dates
                    # together on a single item.
                    print("--------2-------")
                    item['start_date'] = ",".join(start_date_list)
                    yield item
            else:
                print("--------3-------")
                yield item
    except Exception as e:
        with open("scrapySchool_Canada_College/error/" +
                  item['school_name'] + ".txt",
                  'a',
                  encoding="utf-8") as f:
            f.write(
                str(e) + "\n" + response.url + "\n========================\n")
        print("异常:", str(e))
        print("报错url:", response.url)
def parse(self, response):
    """Scrape a LaSalle College programme page and yield one populated item.

    Every field is extracted on a best-effort basis: when the expected node
    is missing or its text does not match, that field becomes ``None`` and
    the remaining fields are still filled in.

    Args:
        response: page response; must expose ``xpath()`` and ``url``.

    Yields:
        The ``ScrapyschoolCanadaCollegeItem`` built from the page.
    """
    item = getItem.get_item(ScrapyschoolCanadaCollegeItem)

    # One inline HTML attribute (e.g. ` class="x"`); removed so that stored
    # markup carries tags and text only.
    attribute_re = re.compile(r' [a-zA-Z\-]*=[\'"].+?[\'"]')

    def section_markup(xpath):
        """Join the markup of all nodes matching *xpath*, attributes removed."""
        try:
            return attribute_re.sub(
                '', ''.join(response.xpath(xpath).extract()))
        except Exception:
            # Best effort: a broken section must not abort the whole item.
            return None

    # Programme title.
    try:
        major_name_en = remove_tags(
            response.xpath(
                '/html/body/div[1]/div[2]/div/div[1]/div/div[2]/div/div[2]/div[2]/div[1]'
            ).extract()[0])
        # NOTE(review): this removes every space from the title; confirm that
        # collapsing runs of spaces was not the intent.
        major_name_en = major_name_en.replace('\r\n', '').replace(' ', '')
    except Exception:
        major_name_en = None

    # The "ProgramDetails" node holds "<degree> | <programme code>".
    try:
        program_details = response.xpath(
            '//div[@class = "ProgramDetails"]').extract()[0]
    except Exception:
        program_details = None

    # Programme code: the text after the last "|" on the matched line.
    try:
        programme_code = re.findall(r'.*\|(.*)', program_details)[0]
        programme_code = programme_code.lstrip(' ')
        if 'Online' in programme_code:
            programme_code = 'No'
    except Exception:
        programme_code = None

    # Department: derived from the first path segment(s) of the page URL.
    try:
        department = re.findall('http://www.lasallecollege.com/(.*)/.*',
                                response.url)[0]
        department = department.replace('-', ' ')
        if 'online courses' in department:
            department = 'No'
    except Exception:
        department = None

    # Degree name: the text before the "|", mapped from the abbreviation
    # shown on the page to its full wording.
    degree_level = None
    try:
        degree_name = remove_tags(
            re.findall(r'(.*)\|.*', program_details)[0])
        degree_name = degree_name.replace('Ontario College ',
                                          '').replace(' ', '')
        if 'DEC' in degree_name:
            degree_name = 'A diploma of college studies'
        elif 'AEC' in degree_name:
            degree_name = 'An attestation of college studies'
        elif 'DEP' in degree_name:
            degree_name = 'A diploma of vocational studies'
    except Exception:
        degree_name = None

    # Overview; the same markup also serves as the degree description.
    overview_en = section_markup('//div[@class = "programContent"]')
    degree_name_desc = overview_en

    # Start dates: semester names are rewritten to "YYYY-MM," stamps.
    try:
        start_date = remove_tags(''.join(
            response.xpath(
                '//span[contains(text(),"Semester")]/following-sibling::span[1]'
            ).extract()))
        # Applied strictly in this order; ', ' is collapsed twice on purpose
        # so that a comma followed by two spaces is handled as well.
        for old, new in (
                ('2019', ''),
                ('Spring', '2019-01,'),
                ('Winter', '2019-01,'),
                ('Summer', '2019-05,'),
                ('Fall', '2019-09,'),
                (' | ', ''),
                (', ', ','),
                (', ', ','),
        ):
            start_date = start_date.replace(old, new)
        start_date = start_date.rstrip(',')
    except Exception:
        start_date = None

    # Course list.
    modules_en = section_markup(
        '//div[contains(text(),"Concentration Courses")]/following-sibling::div|//div[contains(text(),"Specialized training")]/following-sibling::div'
    )
    print(modules_en)

    # Career prospects, with line breaks, tabs and spaces removed.
    career_en = section_markup(
        '//div[contains(text(),"Career Prospects")]/following-sibling::div')
    if career_en is not None:
        for junk in ('\r\n', '\n', '\t', ' '):
            career_en = career_en.replace(junk, '')

    # Tuition fee: first amount shaped like "$12,345.67", without the "$".
    try:
        tuition_fee = remove_tags(''.join(
            response.xpath(
                '//h3[contains(text(),"Tuition and Fees")]/following-sibling::div'
            ).extract()))
        tuition_fee = re.findall(r'(\$\d\d,\d\d\d\.\d\d)', tuition_fee)[0]
        tuition_fee = tuition_fee.replace('$', '')
    except Exception:
        tuition_fee = None

    # Admission criteria; reused unchanged as the requirement for Chinese
    # applicants.
    entry_requirements_en = section_markup(
        '//div[contains(text(),"Admission Criteria")]/following-sibling::div')
    require_chinese_en = entry_requirements_en

    # Programme-specific requirement: everything before "Note:" in the
    # admission criteria (empty string when there is no "Note:").
    try:
        specific_requirement_en = ''.join(
            re.findall('(.*)Note:', entry_requirements_en.replace('\n', '')))
        specific_requirement_en = attribute_re.sub('',
                                                   specific_requirement_en)
    except Exception:
        specific_requirement_en = None

    fields = {
        'school_name': 'LaSalle College',
        'location': 'Montreal',
        'campus': 'Montreal',
        'department': department,
        'degree_name': degree_name,
        'degree_name_desc': degree_name_desc,
        'major_name_en': major_name_en,
        'programme_code': programme_code,
        'overview_en': overview_en,
        'start_date': start_date,
        'duration': None,
        'duration_per': None,
        'modules_en': modules_en,
        'career_en': career_en,
        'deadline': None,
        'apply_pre': 'CAD$',
        'apply_fee': '101',
        'tuition_fee_pre': 'CAD$',
        'tuition_fee': tuition_fee,
        'tuition_fee_per': '5',
        'entry_requirements_en': entry_requirements_en,
        'require_chinese_en': require_chinese_en,
        'specific_requirement_en': specific_requirement_en,
        # The fields below have no extraction rule for this school: the
        # exam/interview/portfolio fields are always None and the language
        # scores are always emitted as empty strings.
        'average_score': None,
        'gaokao_desc': None,
        'gaokao_zs': None,
        'huikao_desc': None,
        'huikao_zs': None,
        'ielts_desc': '',
        'ielts': '',
        'ielts_l': '',
        'ielts_s': '',
        'ielts_r': '',
        'ielts_w': '',
        'toefl_code': None,
        'toefl_desc': None,
        'toefl': '',
        'toefl_l': '',
        'toefl_s': '',
        'toefl_r': '',
        'toefl_w': '',
        'interview_desc_en': None,
        'portfolio_desc_en': None,
        'other': '时长在pdf中,deadline,学位类型待确认,fee没有, 学位类型对应网站:http://www.lasallecollege.com/futur-students/study-in-canada 没找到语言要求',
        'url': response.url,
        'degree_level': degree_level,
    }
    for key, value in fields.items():
        item[key] = value
    yield item
def parse_data(self, response):
    """Scrape a Seneca College programme page and yield one populated item.

    Static school data is set first. Page data is then extracted inside a
    single ``try``: if anything fails, the error and the page URL are
    appended to the school's error file and printed, and no item is yielded.

    The URLs found behind the "Admission requirements", "Courses" and
    "Costs" links are passed to ``self.parse_entry_requirements_en``,
    ``self.parse_modules`` and ``self.parse_tuition_fee`` respectively.

    Args:
        response: page response; must expose ``xpath()`` and ``url``.

    Yields:
        The ``ScrapyschoolCanadaCollegeItem`` built from the page.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)
    item['school_name'] = "Seneca College"
    item['url'] = response.url
    print("===========================")
    print(response.url)
    item['location'] = '1750 Finch Avenue East Toronto, Ontario, Canada M2J 2X5'
    item['other'] = """问题描述: 1.没有找到截止日期"""

    # http://www.senecacollege.ca/international/apply/how-to-apply.html
    item['apply_pre'] = "CAD$"
    item['apply_fee'] = '65'

    def language_fields(ielts_desc, ielts, ielts_band, toefl_desc, toefl,
                        toefl_band, documents):
        """Build the item fields shared by one degree level."""
        return {
            'ielts_desc': ielts_desc,
            'ielts': ielts,
            'ielts_l': ielts_band,
            'ielts_s': ielts_band,
            'ielts_r': ielts_band,
            'ielts_w': ielts_band,
            'toefl_desc': toefl_desc,
            'toefl': toefl,
            'toefl_l': toefl_band,
            'toefl_s': toefl_band,
            'toefl_r': toefl_band,
            'toefl_w': toefl_band,
            'require_chinese_en': documents,
        }

    # Language scores and document requirements keyed by degree_level
    # (1 = degree, 2 = graduate certificate, 3 = diploma). Levels not listed
    # here (e.g. plain certificates) get no language fields.
    requirements_by_level = {
        1: language_fields(
            "Overall band of 6.5. No single test score below 6.0", '6.5',
            '6.0',
            'Overall 84 with Writing, Reading, Listening and Speaking minimums of 21',
            '84', '21',
            """<p>When applying to a Seneca degree program you must submit:</p> <ul> <li>High school transcripts for grades 10, 11 and 12 showing all program specific pre-requisite courses.</li> <li>Transcripts must include six (6) senior level courses equivalent to Ontario university preparatory credits with an overall average of 65%.</li> <li>High school/secondary school diploma (certificate of completion).</li> <li>You may submit transcripts or certificates for any completed university/postsecondary college courses or programs taken inside or outside of Canada.</li> <li>Academic records that are in a language other than English must include an official/certified English translation.</li> <li>Seneca reserves the right to verify submitted transcripts at any time. For courses and credentials earned inside of Canada, Seneca reserves the right to request original transcripts be sent directly to Seneca from the issuing institution. Applicants will be notified if this is needed.</li> </ul>"""
        ),
        2: language_fields(
            "Overall band of 6.5. No single test score below 6.0", '6.5',
            '6.0',
            'Overall 88 with Writing, Reading, Listening and Speaking minimums of 22',
            '88', '22',
            """<p>When applying to a Seneca graduate certificate program you must submit:</p> <ul> <li>Complete university and college transcripts for all years of study.</li> <li>University or college credential (degree, diploma or certificate).</li> <li>A credential assessment from a recognized agency such as WES (World Education Services) may be required for some programs. Applicants will be informed by email if this is needed.</li> <li>Academic records that are in a language other than English must include an official/certified English translation.</li> <li>Seneca reserves the right to verify submitted transcripts at any time. For courses and credentials earned inside of Canada, Seneca reserves the right to request original transcripts be sent directly to Seneca from the issuing institution. Applicants will be informed by email if this is needed.</li> </ul>"""
        ),
        3: language_fields(
            "Overall band of 6.0. No single test score below 5.5", '6.0',
            '5.5',
            'Overall 80 with Writing, Reading, Listening and Speaking minimums of 20',
            '80', '20',
            """<p>When applying to a Seneca two- or three-year diploma program you must submit:</p> <ul> <li>High school transcripts for grades 10, 11 and 12 showing all program specific pre-requisite courses.</li> <li>High school/secondary school diploma (certificate of completion).</li> <li>You may submit transcripts or certificates for any completed university/postsecondary college courses or programs taken inside or outside of Canada.</li> <li>Academic records that are in a language other than English must include an official/certified English translation.</li> <li>Seneca reserves the right to verify submitted transcripts at any time. For courses and credentials earned inside of Canada, Seneca reserves the right to request original transcripts be sent directly to Seneca from the issuing institution. Applicants will be notified if this is needed.</li> </ul>"""
        ),
    }

    def cleaned_text(xpath):
        """Extract all matches of *xpath* and run them through clear_space."""
        values = response.xpath(xpath).extract()
        clear_space(values)
        return values

    try:
        # Programme title and code (the code sits in a <span> inside the h1).
        title = cleaned_text(
            "//div[@class='carousel-complementary-box']//h1/text()")
        if title:
            item['major_name_en'] = ''.join(title).strip()
        print("item['major_name_en']: ", item['major_name_en'])

        code = cleaned_text(
            "//div[@class='carousel-complementary-box']//h1/span//text()")
        if code:
            item['programme_code'] = ''.join(code).replace("(", "").replace(
                ")", "").strip()
        print("item['programme_code']: ", item['programme_code'])

        # Start dates: each intake month named on the page maps to a fixed
        # "YYYY-MM" stamp.
        starts = cleaned_text("//p[contains(text(),'Starts:')]/text()")
        start_date_end = ""
        if starts:
            start_text = ''.join(starts).replace("Starts:", "").strip()
            for month, stamp in (("May", "2019-05,"),
                                 ("September", "2019-09,"),
                                 ("January", "2020-01,")):
                if month in start_text:
                    start_date_end += stamp
        item['start_date'] = start_date_end.strip().strip(',').strip()

        campus = cleaned_text(
            "//h5[contains(text(),'Campus')]/following-sibling::*//a//text()")
        if campus:
            item['campus'] = ''.join(campus).strip()

        department = cleaned_text(
            "//h5[contains(text(),'School')]/following-sibling::*//text()")
        if department:
            item['department'] = ''.join(department).strip()

        # Duration: number and unit are both taken from the FIRST
        # "<n> year|month|week" match, so a second figure on the page cannot
        # be glued onto the number.
        duration = cleaned_text(
            "//h5[contains(text(),'Duration')]/following-sibling::*//text()")
        duration_re = re.findall(r"\d+\syear|\d+\smonth|\d+\sweek",
                                 ''.join(duration), re.I)
        if duration_re:
            first_duration = duration_re[0]
            unit_text = first_duration.lower()
            for unit, unit_code in (("year", 1), ("month", 3), ("week", 4)):
                if unit in unit_text:
                    item['duration_per'] = unit_code
            item['duration'] = ''.join(re.findall(r"\d+",
                                                  first_duration)).strip()

        # Credential awarded -> degree name and level.
        credential = cleaned_text(
            "//h5[contains(text(),'Credential Awarded')]/following-sibling::*//text()"
        )
        if credential:
            item['degree_name'] = ''.join(credential).replace(
                "Ontario College", "").replace("Seneca College", "").strip()
        if item['degree_name'] is not None:
            credential_lower = item['degree_name'].lower()
            if "diploma" in credential_lower:
                item['degree_level'] = 3
            elif "degree" in credential_lower:
                # The title may be missing; treat that as "no Bachelor in the
                # title" instead of failing the whole item.
                if "Bachelor" in (item['major_name_en'] or ''):
                    item['degree_name'] = item['major_name_en']
                # NOTE(review): level 1 is assigned to every "degree"
                # credential, not only when the title contains "Bachelor" -
                # confirm this nesting against the original layout.
                item['degree_level'] = 1
            elif "graduate" in credential_lower:
                item['degree_level'] = 2

        # Language and document requirements for the detected level.
        level_fields = requirements_by_level.get(item['degree_level'])
        if level_fields is not None:
            for key, value in level_fields.items():
                item[key] = value

        overview = response.xpath(
            "//h2[contains(text(),'About the Program')]/../../..").extract()
        if overview:
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

        career_en = response.xpath(
            "//h3[contains(text(),'Related Careers')]/../../preceding-sibling::*"
        ).extract()
        if career_en:
            item['career_en'] = remove_class(clear_lianxu_space(career_en))

        # Linked sub-pages; hrefs are appended to the site root.
        entry_requirements_en_url = response.xpath(
            "//a[contains(text(),'Admission requirements')]/@href").extract()
        if entry_requirements_en_url:
            entry_requirements_interview = self.parse_entry_requirements_en(
                "http://www.senecacollege.ca" + entry_requirements_en_url[0])
            # First element is used as the requirements, last as the
            # interview description.
            item['entry_requirements_en'] = entry_requirements_interview[0]
            item['interview_desc_en'] = entry_requirements_interview[-1]

        modules_url = response.xpath(
            "//a[contains(text(),'Courses')]/@href").extract()
        if modules_url:
            item['modules_en'] = self.parse_modules(
                "http://www.senecacollege.ca" + modules_url[0])

        item['tuition_fee_per'] = "1"
        item['tuition_fee_pre'] = "CAD$"
        tuition_fee_url = response.xpath(
            "//a[contains(text(),'Costs')]/@href").extract()
        print("tuition_fee_url: ", tuition_fee_url)
        if tuition_fee_url:
            item['tuition_fee'] = self.parse_tuition_fee(
                "http://www.senecacollege.ca" + tuition_fee_url[0])
        print("item['tuition_fee']: ", item['tuition_fee'])

        yield item
    except Exception as e:
        # Record the failure next to the URL so the page can be re-run later.
        with open("scrapySchool_Canada_College/error/" + item['school_name'] +
                  ".txt",
                  'a',
                  encoding="utf-8") as f:
            f.write(
                str(e) + "\n" + response.url + "\n========================\n")
        print("异常:", str(e))
        print("报错url:", response.url)
def parse(self, response): item = getItem.get_item(ScrapyschoolCanadaCollegeItem) try: major_name_en = response.xpath('//h1').extract()[0] major_name_en = remove_tags(major_name_en) major_name_en = major_name_en.replace(' &', '') major_name_en = major_name_en.replace( 'Post-Degree Diploma in Applied Tourism and Hospitality Management', 'Applied Tourism and ospitality Management') # print(major_name_en) except: major_name_en = None # print(major_name_en) #programme_code try: programme_code = None # programme_code = remove_tags(programme_code) #programme_code = re.findall('\d\d\d', programme_code)[0] #print(programme_code) except: programme_code = None #print(programme_code) try: duration = response.xpath( '//dt[contains(text(),"Length")]/following-sibling::dd[1]' ).extract()[0] duration = remove_tags(duration) duration_per = '1' if 'Two or four years' in duration: duration = '2,4' elif '2 ½ to 3 years' in duration: duration = '2,3' elif 'Two Semesters' in duration: duration = '2' duration_per = '2' elif '8 months' in duration: duration = '8' duration_per = '3' elif '10 months' in duration: duration = '10' duration_per = '3' elif 'Two years' in duration and 'Two' in duration: duration = '2' duration_per = '1' elif 'One or two years' in duration: duration = '1,2' elif '20 consecutive months' in duration: duration = '20' duration_per = '3' elif 'One year' in duration: duration = '1' duration_per = '1' elif '8-12 months' in duration: duration_per = '1' duration = '1' elif 'Varies' in duration: duration = 'No' elif 'One, Two, or Four years' in duration: duration = '1,2,4' elif '16 months' in duration: duration = '16' duration_per = '3' elif 'Two years' in duration: duration = '2' elif 'Four years' in duration: duration = '4' elif 'Two academic years' in duration: duration = '2' elif '3 years' in duration: duration = '3' elif 'One year' in duration: duration = '1' # print(duration) except: duration = 'No' duration_per = None # print(duration) #1.学校名称 school_name = 
'Camosun College' #2.地点 try: location = response.xpath( '//dt[contains(text(),"Campus")]/following-sibling::dd[1]' ).extract() location = ''.join(location) location = location.replace(' ', '').replace(' ', '') location = remove_tags(location) # print(location) except: location = None # print(location) #3. 校区 try: campus = location # campus = remove_tags(campus) # campus = campus.replace(', Online','') # campus = campus.replace(' ','') # campus = campus.split(',') #print(campus) except: campus = None #print(campus) #4. 学院 try: department = response.xpath( '//dt[contains(text(),"School")]/following-sibling::dd[1]' ).extract() department = ''.join(department) department = remove_tags(department) department = department.replace(' &', '') #print(department) # print(department) #print(response.url) except: department = None # print(department) # 4. try: degree_name = response.xpath( '//*[@id="page-banner"]/div/div/div/h2').extract()[0] degree_name = remove_tags(degree_name) if 'Associate' in degree_name: degree_level = '4' elif 'Advanced Diploma' in degree_name: degree_level = '4' elif 'Bachelor\'s Degree' in degree_name or 'Bachelor of' in degree_name: degree_level = '1' elif 'Post-Degree Diploma' in degree_name: degree_level = '2' elif 'Diploma' in degree_name: degree_level = '3' else: degree_level = 'No' if 'in' in degree_name: degree_name = re.findall('(.*) in', degree_name)[0] #print(degree_level) #print(degree_name) except: degree_level = '' degree_name = None #print(degree_name) # #5.学位描述 try: degree_overview_en = response.xpath( '//*[@id="page-introduction"]/div/p').extract() degree_overview_en = ''.join(degree_overview_en) #degree_overview_en = remove_tags(degree_overview_en) degree_overview_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', degree_overview_en) #degree_overview_en = degree_overview_en.replace('\r\n','') #degree_overview_en = degree_overview_en.replace('\n','') #degree_overview_en = degree_overview_en.replace('\n','') #degree_overview_en = 
degree_overview_en.replace(' ',' ') #degree_overview_en = degree_overview_en.replace(' ','') #degree_overview_en = degree_overview_en.replace(' ','') # print(degree_overview_en) except: degree_overview_en = None # print(degree_overview_en) #6.专业英文 #7.专业介绍 try: #overview_en = degree_overview_en overview_en = degree_overview_en # print(overview_en) except: overview_en = degree_overview_en # print(overview_en) #8.入学时间 try: start_date = response.xpath( '//dt[contains(text(),"Start")]/following-sibling::dd[1]' ).extract() start_date = ','.join(start_date) start_date = remove_tags(start_date) if 'September, January or May' in start_date or 'September, January, or May' in start_date or 'September, January, May' in start_date or 'January, May or September' in start_date: start_date = '2019-01,2019-05,2019-09' elif 'Full-time & Part-time: September Part-time: January' in start_date or 'September or January (Part Time)' in start_date: start_date = '2019-09' elif 'UVic Bridge: January UBC Bridge: September ' in start_date: start_date = '2019-01,2019-09' elif 'January 2020 (every other year)' in start_date: start_date = '2020-01' elif 'Fall, Winter' in start_date: start_date = '2019-01,2019-09' elif 'September' in start_date or 'Fall' in start_date: start_date = '2019-09' #print(start_date) except: start_date = None #print(start_date) #10.课程设置 try: if 'calendar' in response.url: url_list = response.url else: url_list = re.findall('subjects/(.*)/', response.url)[0] url_list = re.findall('(\w\w\w\w).*', url_list)[0] url_list = 'http://camosun.ca/learn/calendar/current/web/' + url_list + '.html' #print(url_list) # print(url_list) headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36" } response1 = requests.get(url_list, headers=headers) response1 = response1.text modules_en = response1 modules_en = ''.join(modules_en) # modules_en = ''.join(modules_en) modules_en = re.sub(' 
[a-zA-Z\-]*=[\'\"].+?[\'\"]', '', modules_en) modules_en = ''.join(modules_en) modules_en = modules_en.replace('\r\n', '').replace('\n', '') modules_en = re.findall('</h1>(.*)', modules_en)[0] modules_en = re.findall('(.*)Contact Us', modules_en)[0] print(modules_en) #modules_en = modules_en.replace('\r\n','').replace('\n','').replace('\t','').replace(' ','').replace(' ','') # print(modules_en) except: modules_en = None print(modules_en) #11.就业方向 try: career_en = response.xpath( '//h3[contains(text(),"opportunities")]/following-sibling::*' ).extract() career_en = ''.join(career_en) career_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', career_en) # career_en = career_en.replace('\r\n','').replace('\n','').replace('\t','').replace(' ','').replace(' ','') # print(career_en) except: career_en = None #print(career_en) #12.截止日期 try: deadline = start_date.replace('-09', '-07-10').replace( '-01', '-11-10').replace('-05', '-03-10') #print(start_date) #print(deadline) except: deadline = None #print(deadline) #13.学费 try: if 'calendar' in response.url: url_list = response.url else: url_list = re.findall('subjects/(.*)/', response.url)[0] url_list = re.findall('(\w\w\w\w).*', url_list)[0] url_list = 'http://camosun.ca/learn/calendar/current/web/' + url_list + '.html' print(url_list) headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36" } response2 = requests.get(url_list, headers=headers) response2 = response2.text response2 = etree.HTML(response2) tuition_fee = response2.xpath( '//*[@id="content"]/div/div/div[2]/text()')[0] #print(tuition_fee) # print(response.url) except: tuition_fee = None #print(tuition_fee) #14 申请费: apply_fee = '100' try: url_list = response.url url_list = url_list + '/admission-requirements/index.html' headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36" } response2 = 
requests.get(url_list, headers=headers) response2 = response2.text entry_requirements_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]', '', response2) entry_requirements_en = entry_requirements_en.replace('\r\n', '').replace( '\n', '') entry_requirements_en = re.findall( '<h2>Admission Requirements</h2>(.*)', entry_requirements_en)[0] print(entry_requirements_en) #print(abc) except: entry_requirements_en = None #print(entry_requirements_en) #16 中国学生申请要求 try: require_chinese_en = '<div> <h2>Step 1 - Find a program</h2> <p>From academic upgrading to business administration degrees and health care technical training, Camosun offers something for everyone. <a>Browse our program list</a>.</p> <h3>English proficiency levels</h3> <p>Admission to most academic programs is based on meeting <strong>one of the following</strong> English proficiency levels:</p> <ul> <li>Completion of BC English 12 with a <strong>C+</strong> or better</li> <li>Academic IELTS</li> <li>TOEFL iBT</li> <li><a>Camosun English assessment</a></li> </ul> <p>You can also apply for <a>English upgrading</a> in order to meet your English proficiency for program admission.</p> <p><strong>Note: </strong>Academic IELTS and TOEFL iBT scores must be from within the past two years. 
For TOEFL we require an original document from <a>ETS</a> to Camosun College: Destination (DI) Code: 7527</p> <p>IELTS or TOEFL requirements for <strong>undergraduate</strong> programs:</p> <ul> <li><strong>Academic IELTS score of 6.0</strong> with no individual band less than 5.5</li> <li><strong>TOEFL iBT score of 83</strong> with no score less than 20 on each level </li> </ul> <p>IELTS or TOEFL requirements for <strong>post-graduate</strong> programs:</p> <ul> <li><strong>Academic IELTS score of 6.5</strong> with no individual band less than 6.0</li> <li><strong>TOEFL iBT score of 88</strong> with no score less than 20 on each level</li> </ul> <p><span>Note</span> Applicants from countries where <strong>Study Direct Stream (SDS)</strong> or <strong>Canada Express Study (CES)</strong> are available must meet corresponding SDS/CES criteria in addition to Camosun admission requirements.</p> <p>Please see our <a>important dates calendar</a> for start dates and tuition deadlines.</p> <div> <h3>Consider Camosun Homestay</h3> <p>Experience Victoria with a Camosun Homestay family. You must apply at least six weeks before the start of your program. 
<a>Learn more</a> about the Camosun Homestay program.</p> </div> </div>' #require_chinese_en = remove_tags(require_chinese_en) # print(require_chinese_en) except: require_chinese_en = None # print(require_chinese_en) #17 特殊专业要求 try: specific_requirement_en = entry_requirements_en # specific_requirement_en = specific_requirement_en.replace('\n','') # specific_requirement_en = re.findall('(.*)Note:',specific_requirement_en) # #specific_requirement_en = remove_tags(specific_requirement_en) #specific_requirement_en = ''.join(specific_requirement_en) # specific_requirement_en = re.sub(' [a-zA-Z\-]*=[\'\"].+?[\'\"]','',specific_requirement_en) # specific_requirement_en = specific_requirement_en.replace('\r\n','') # specific_requirement_en = re.findall('Required high school classes(.*)2.',specific_requirement_en)[0] # specific_requirement_en = remove_tags(specific_requirement_en,keep=("li","ul")) #print(specific_requirement_en) except: specific_requirement_en = None #print(specific_requirement_en) #18 高考(官网要求) try: gaokao_desc = response.xpath('').extract()[0] gaokao_desc = remove_tags(gaokao_desc) # print(gaokao_desc) except: gaokao_desc = None # print(gaokao_desc) #19 高考(展示以及判断字段) try: gaokao_zs = response.xpath('').extract()[0] gaokao_zs = remove_tags(gaokao_zs) # print(gaokao_zs) except: gaokao_zs = None # print(gaokao_zs) #22 会考描述 try: huikao_desc = response.xpath('').extract()[0] huikao_desc = remove_tags(huikao_desc) # print(huikao_desc) except: huikao_desc = None # print(huikao_desc) #23 会考描述 try: huikao_zs = response.xpath('').extract()[0] huikao_zs = remove_tags(huikao_zs) # print(huikao_zs) except: huikao_zs = None # print(huikao_zs) #25 雅思要求 ielts_desc = 'Academic IELTS score of 6.0 with no individual band less than 5.5/Academic IELTS score of 6.5 with no individual band less than 6.0' #26 ielts try: if degree_level == 2: ielts = '6.5' ielts_l = '6.0' toefl = '88' else: ielts_l = '5.5' ielts = '6.0' toefl = '83' except: ielts_l = None ielts = None toefl = None 
#27 ielts_? ielts_s = ielts_l ielts_r = ielts_l ielts_w = ielts_l toefl_l = '20' toefl_s = '20' toefl_w = '20' toefl_r = '20' #print(ielts_l) #28 toefl_code try: toefl_code = '7527' #toefl_code = remove_tags(toefl_code) # print(toefl_code) except: toefl_code = None # print(toefl_code) #29 toefl_desc try: toefl_desc = 'TOEFL iBT score of 83 with no score less than 20 on each level/TOEFL iBT score of 88 with no score less than 20 on each level' #toefl_desc = remove_tags(toefl_desc) # print(toefl_desc) except: toefl_desc = None # print(toefl_desc) #35 面试描述 try: interview_desc_en = response.xpath('').extract()[0] interview_desc_en = remove_tags(interview_desc_en) # print(interview_desc_en except: interview_desc_en = None # print(interview_desc_en) #36 作品集描述 try: portfolio_desc_en = response.xpath('').extract()[0] portfolio_desc_en = remove_tags(portfolio_desc_en) # print(portfolio_desc_en except: portfolio_desc_en = None # print(portfolio_desc_en) #37 other try: other = '没有课程代码,跳转3次页面跑的特别慢' #other = remove_tags(other) # print(other) except: other = None # print(other) #平均分 average_score average_score = None # degree_name_desc try: degree_name_desc = overview_en except: degree_name_desc = None # ap = 'Minimum grade of 3 or C ' item['school_name'] = school_name item['location'] = location item['campus'] = campus item['department'] = department item['degree_name'] = degree_name item['degree_name_desc'] = degree_name_desc item['major_name_en'] = major_name_en item['programme_code'] = programme_code item['overview_en'] = overview_en item['start_date'] = start_date item['duration'] = duration item['duration_per'] = duration_per item['modules_en'] = modules_en item['career_en'] = career_en item['deadline'] = deadline item['apply_pre'] = 'CAD$' item['apply_fee'] = apply_fee item['tuition_fee_pre'] = 'CAD$' item['tuition_fee'] = tuition_fee item['tuition_fee_per'] = '5' item['entry_requirements_en'] = entry_requirements_en item['require_chinese_en'] = require_chinese_en 
item['specific_requirement_en'] = specific_requirement_en item['average_score'] = average_score item['gaokao_desc'] = gaokao_desc item['gaokao_zs'] = gaokao_zs item['huikao_desc'] = huikao_desc item['huikao_zs'] = huikao_zs item['ielts_desc'] = ielts_desc item['ielts'] = ielts item['ielts_l'] = ielts_l item['ielts_s'] = ielts_s item['ielts_r'] = ielts_r item['ielts_w'] = ielts_w item['toefl_code'] = toefl_code item['toefl_desc'] = toefl_desc item['toefl'] = toefl item['toefl_l'] = toefl_l item['toefl_s'] = toefl_s item['toefl_r'] = toefl_r item['toefl_w'] = toefl_w item['interview_desc_en'] = interview_desc_en item['portfolio_desc_en'] = portfolio_desc_en item['other'] = other item['url'] = response.url item['degree_level'] = degree_level
def parse(self, response):
    """Scrape one Humber College programme page and yield the filled item.

    Constant fields (school name, location, currency prefix) are combined
    with values extracted from ``response`` through XPath queries.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)

    def joined_markup(query, separator=''):
        # Concatenate the serialised markup of every node matched by query.
        return separator.join(response.xpath(query).extract())

    # 4. programme_code: text of the node carrying the label, label removed.
    code_markup = joined_markup("//*[contains(text(),'Program Code: ')]")
    programme_code = remove_tags(code_markup).replace('Program Code: ', '')

    # 5. overview_en: everything that precedes the "Courses" node.
    overview_en = remove_class(
        joined_markup("//*[contains(text(),'Courses')]//preceding-sibling::*"))

    # 6. modules_en
    modules_en = remove_class(
        joined_markup("//div[contains(@class,'curriculum')]"))

    # 7. career_en
    career_en = remove_class(
        joined_markup("//div[contains(@class,'learning-outcomes')]"))

    # 8. start_date: intake labels rewritten as YYYY-MM, applied in order.
    # The second 'January 2020' pair never matches because the first one has
    # already consumed every occurrence; it is kept to mirror the original
    # replacement sequence exactly.
    intake_rewrites = (
        ('September 2019', '2019-09'),
        ('January 2020', '2020-01'),
        ('January 2019', '2019-01'),
        ('May 2019', '2019-05'),
        ('January 2020', '2019-01'),
        ('May 2020', '2020-05'),
    )
    start_date = remove_tags(
        joined_markup('//td[@data-label="START DATE"]', ','))
    for label, iso_month in intake_rewrites:
        start_date = start_date.replace(label, iso_month)

    # 9. department
    department = remove_tags(
        joined_markup('//*[@id="HumberNav"]/div[2]/h1/a')).replace('& ', '')

    # Copy the collected values onto the item, one key at a time.
    collected = (
        ('school_name', 'Humber College'),
        ('url', response.url),
        ('location', 'Ontario, Canada'),
        ('programme_code', programme_code),
        ('overview_en', overview_en),
        ('modules_en', modules_en),
        ('career_en', career_en),
        ('start_date', start_date),
        ('department', department),
        ('tuition_fee_pre', '$'),
    )
    for field, value in collected:
        item[field] = value
    yield item
def parse_data(self, response):
    """Scrape one NAIT programme page and yield the resulting item(s).

    Static school data is set first. The page-dependent fields are then
    extracted inside a single ``try`` block; any exception is appended to a
    per-school error file and printed instead of propagating.

    One item is yielded per programme, except when the entry-requirements
    helper returns a ``major_list`` mentioning "Emphasis": then the same item
    object is yielded once per entry with ``major_name_en`` overwritten.

    NOTE(review): the original indentation was lost. Everything from the
    duration lookup to the final yields is reconstructed as nested under
    ``if item['degree_level'] is not None`` -- confirm against the upstream
    file.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)
    item['school_name'] = "Northern Alberta Institute of Technology"
    item['url'] = response.url
    print("===========================")
    print(response.url)
    item['location'] = '11762 - 106 Street Edmonton, Alberta, Canada, T5G 2R1'
    item['other'] = """问题描述: 1.没有专业代码和中国学生要求"""

    # Application fee source: http://www.nait.ca/88953.htm
    item['apply_pre'] = "CAD$"
    item['apply_fee'] = '115'

    try:
        major_name_en = response.xpath(
            "//section[@id='content']/h1//text()").extract()
        clear_space(major_name_en)
        item['major_name_en'] = ''.join(major_name_en).strip()
        print("item['major_name_en']: ", item['major_name_en'])

        campus = response.xpath(
            "//span[@title='Campus']/following-sibling::span//text()"
        ).extract()
        clear_space(campus)
        item['campus'] = ''.join(campus).strip()

        degree_name = response.xpath(
            "//span[@title='Credential']/following-sibling::span/text()"
        ).extract()
        clear_space(degree_name)
        item['degree_name'] = ''.join(degree_name).strip()

        # The three checks are independent (not elif): a later match
        # overrides the level chosen by an earlier one.
        if item['degree_name'] == "Diploma":
            item['degree_level'] = 3
        if "degree" in item['degree_name'].lower():
            item['degree_level'] = 1
            item['degree_name'] = item['major_name_en']
        if "Post" in item['degree_name']:
            item['degree_level'] = 2
        print("item['degree_name']1: ", item['degree_name'])

        if item['degree_level'] is not None:
            duration = response.xpath(
                "//span[@title='Program Length']/following-sibling::span//text()"
            ).extract()
            clear_space(duration)
            duration_text = ''.join(duration)
            duration_lower = duration_text.lower()

            # Determine the unit of the programme length.
            if "year" in duration_lower:
                item['duration_per'] = 1
            if "month" in duration_lower:
                item['duration_per'] = 3
            if "week" in duration_lower:
                item['duration_per'] = 4

            # Only a length expressed in years is extracted as a number.
            duration_re = re.findall(r"\d+\syear", duration_text)
            if duration_re:
                item['duration'] = ''.join(duration_re[0]).replace(
                    "year", "").strip()

            start_date = response.xpath(
                "//strong[contains(text(),'Start Date')]/../text()"
            ).extract()
            clear_space(start_date)
            month_numbers = {
                "january": "01",
                "february": "02",
                "march": "03",
                "april": "04",
                "may": "05",
                "june": "06",
                "july": "07",
                "august": "08",
                "september": "09",
                "october": "10",
                "november": "11",
                "december": "12",
                "jan": "01",
                "feb": "02",
                "mar": "03",
                "apr": "04",
                "jun": "06",
                "jul": "07",
                "aug": "08",
                "sep": "09",
                "oct": "10",
                "nov": "11",
                "dec": "12",
                "sept": "09",
            }
            if start_date:
                # Convert each date into the required format. The day
                # pattern keeps its trailing comma, which then separates
                # consecutive dates once the strings are concatenated.
                for index, raw_date in enumerate(start_date):
                    month_key = ''.join(
                        re.findall(r"[A-Za-z]+", raw_date)).lower().strip()
                    day_re = re.findall(r"\d+,", raw_date)
                    year_re = re.findall(r"\d{4}", raw_date)
                    month_number = month_numbers.get(month_key)
                    if month_number is not None:
                        start_date[index] = (''.join(year_re) + "-" +
                                             month_number + "-" +
                                             ''.join(day_re))
                # Even positions feed start_date, odd positions feed deadline.
                item['start_date'] = ''.join(
                    start_date[0::2]).strip().strip(",").strip()
                item['deadline'] = ''.join(
                    start_date[1::2]).strip().strip(",").strip()

            overview = response.xpath(
                "//div[@id='program-quick-facts']/preceding-sibling::*"
            ).extract()
            if not overview:
                overview = response.xpath(
                    "//div[@id='program-quick-facts']/../text()").extract()
            if overview:
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview))

            career_en = response.xpath(
                "//h3[@class='prepend-top']/../*[position()<last()]"
            ).extract()
            if career_en:
                item['career_en'] = remove_class(
                    clear_lianxu_space(career_en))

            modules_url = response.xpath(
                "//li/a[@id='sidenav-child'][contains(text(), 'Courses')]/@href"
            ).extract()
            if modules_url:
                item['modules_en'] = self.parse_modules(modules_url[0])

            # Parallel tables: major_key[i] is charged tuition_fee_value[i].
            major_key = [
                "Academic Upgrading ",
                "Alternative Energy Technology",
                "Animal Health Technology",
                "Applied Financial Services",
                "Architectural Technology",
                "Bachelor of Applied Business Administration - Accounting",
                "Bachelor of Applied Business Administration - Finance",
                "Bachelor of Applied Information Systems Technology",
                "Bachelor of Business Administration",
                "Bachelor of Technology - Construction Management",
                "Bachelor of Technology in Technology Management",
                "Baking and Pastry Arts",
                "Biological Sciences Technology",
                "Biomedical Engineering Technology",
                "Biomedical Engineering Technology Co-op",
                "Building Environmental Systems Technology",
                "Business Administration - Accounting",
                "Business Administration - Finance",
                "Business Administration - Human Resources Management",
                "Business Administration - Management",
                "Business Administration - Marketing",
                "Business Administration - Year One",
                "Captioning and Court Reporting",
                "Chemical Engineering Technology",
                "Chemical Technology",
                "Chemical Technology Co-op",
                "Civil Engineering Technology",
                "Civil Engineering Technology Co-op",
                "CNC Machinist Technician",
                "Computer Engineering Technology",
                "Computer Engineering Technology Co-op",
                "Computer Network Administrator",
                "Construction Engineering Technology",
                "Culinary Arts",
                "Cytotechnology",
                "Dental Assisting Technology",
                "Dental Technology",
                "Denturist Technology",
                "Digital Media and IT",
                "Digital Media and IT Co-op",
                "Electrical Engineering Technology",
                "Electronics Engineering Technology",
                "Electronics Engineering Technology Co-op",
                "Emergency Management",
                "Emergency Management Certificate",
                "Engineering Design and Drafting Technology",
                "Forest Technology",
                "Geological Technology",
                "Geomatics Engineering Technology",
                "Graphic Communications",
                "Hospitality Management",
                "HVAC-R Technician Certificate",
                "Industrial Heavy Equipment Technology",
                "Instrumentation Engineering Technology",
                "Instrumentation Eng Technology Co-op",
                "Interior Design Technology",
                "Landscape Architectural Technology",
                "Materials Engineering Technology",
                "Mechanical Engineering Technology",
                "Medical Laboratory Assistant",
                "Medical Laboratory Technology",
                "Medical Transcription",
                "Millwork & Carpentry Certificate",
                "Nanotechnology Systems",
                "Network Engineering Technology",
                "Network Engineering Technology Co-op",
                "Occupational Health and Safety",
                "Optical Sciences - Contact Lenses",
                "Optical Sciences - Eyeglasses",
                "Personal Fitness Trainer ",
                "Petroleum Engineering Technology",
                "Photographic Technology ",
                "Power Engineering - Fourth Class",
                "Power Engineering Technology ",
                "Pre-Employment - Auto Body Repair ",
                "Pre-Trades - Automotive Service Technician",
                "Radio & Television - Radio",
                "Respiratory Therapy",
                "Retail Meatcutting Certificate",
                "Veterinary Medical Assistant ",
                "Water & Wastewater Technician ",
                "Wireless Systems Engineering Technology",
            ]
            # The sixth entry used to read "8.051" (decimal point), unlike
            # every other fee in the table; it now uses the same
            # thousands-separator form as its neighbours.
            tuition_fee_value = [
                "7,814", "8,653", "7,814", "7,814", "8,051",
                "8,051", "8,051", "8,653", "8,051", "8,993",
                "8,653", "8,051", "8,051", "8,993", "8,993",
                "8,993", "8,051", "8,051", "8,051", "8,051",
                "8,051", "8,051", "7,814", "8,653", "8,653",
                "8,653", "8,993", "8,993 ", "8,653", "8,051",
                "8,051", "8,051", "8,993", "8,051", "8,993",
                "8,653", "8,051", "8,653", "8,051", "8,051",
                "8,993", "8,051", "8,051", "8,993", "8,993",
                "8,653", "8,051", "8,051", "8,993", "7,814",
                "7,814", "8,653", "8,653", "8,993", "8,993",
                "7,814", "8,051", "8,653", "8,653", "8,051",
                "8,653", "8,653", "7,814", "8,653", "8,653",
                "8,653", "8,653", "8,051", "8,051", "7,814",
                "8,993", "7,814", "8,993", "8,993", "8,051",
                "8,653", "7,814", "8,993", "8,051", "7,814",
                "8,993", "8,653",
            ]
            tuition_fee_dict = {
                major.strip(): fee.strip()
                for major, fee in zip(major_key, tuition_fee_value)
            }
            item['tuition_fee'] = tuition_fee_dict.get(item['major_name_en'])
            item['tuition_fee_per'] = "1"
            item['tuition_fee_pre'] = "CAD$"
            # Substring overrides for programme names that do not match a
            # table key exactly.
            if "Biological Sciences Technology" in item['major_name_en']:
                item['tuition_fee'] = "8,051"
            if "Radio & Television" in item['major_name_en']:
                item['tuition_fee'] = "7,814"

            # English-proficiency source: http://www.nait.ca/56525.htm
            higher_band_majors = (
                "Bachelor of Applied Information Systems Technology",
                "Animal Health Technology",
                "Laboratory & X-ray Combined",
                "Dental Assisting Technology",
                "Dental Technology",
                "Denturist Technology",
                "Diagnostic Medical Sonography",
                "Emergency Medical Technology – Paramedic",
                "Magnetic Resonance",
                "Magnetic Resonance Imaging",
                "Medical Laboratory Assisting",
                "Medical Laboratory Technology",
                "Medical Radiologic Technology",
                "Veterinary Medical Assistant",
            )
            if item['major_name_en'] in higher_band_majors:
                # Only the per-skill scores are set in this branch.
                item['ielts_l'] = '6.0'
                item['ielts_s'] = '6.5'
                item['ielts_r'] = '6.0'
                item['ielts_w'] = '6.0'
                item['toefl_l'] = '20'
                item['toefl_s'] = '23'
                item['toefl_r'] = '20'
                item['toefl_w'] = '20'
            elif "Business Administration" in item['major_name_en']:
                item['ielts'] = '5.5'
                item['ielts_l'] = '5.5'
                item['ielts_s'] = '5.5'
                item['ielts_r'] = '5.5'
                item['ielts_w'] = '5.5'
                item['toefl'] = '74'
            elif item['major_name_en'] == "Respiratory Therapy":
                item['ielts_l'] = '8.0'
                item['ielts_s'] = '7.0'
                item['ielts_r'] = '7.0'
                item['ielts_w'] = '7.0'
                item['toefl_l'] = '28'
                item['toefl_s'] = '23'
                item['toefl_r'] = '24'
                item['toefl_w'] = '27'
            else:
                item['ielts'] = '6.5'
                item['ielts_l'] = '5.0'
                item['ielts_s'] = '5.5'
                item['ielts_r'] = '5.0'
                item['ielts_w'] = '5.0'
                item['toefl'] = '80'
                item['toefl_l'] = '20'
                item['toefl_s'] = '20'
                item['toefl_r'] = '20'
                item['toefl_w'] = '20'

            entry_requirements_en_url = response.xpath(
                "//li/a[@id='sidenav-child'][contains(text(), 'About the Program')]/@href"
            ).extract()
            major_list = None
            if entry_requirements_en_url:
                datadict = self.parse_entry_requirements_en(
                    entry_requirements_en_url[0])
                item['entry_requirements_en'] = datadict.get(
                    'entry_requirements_en')
                major_list = datadict.get('major_list')

            if major_list is not None and "Emphasis" in ' '.join(major_list):
                # The same item object is re-yielded with a new major name.
                for major in major_list:
                    item['major_name_en'] = major.replace(
                        "Emphasis", "").strip()
                    yield item
            else:
                yield item
    except Exception as e:
        # Best-effort logging: record the failure and keep the crawl going.
        with open("scrapySchool_Canada_College/error/" +
                  item['school_name'] + ".txt", 'a',
                  encoding="utf-8") as f:
            f.write(
                str(e) + "\n" + response.url + "\n========================\n")
        print("异常:", str(e))
        print("报错url:", response.url)
def parse(self, response):
    """Scrape one Conestoga College programme page and yield the filled item.

    Most fields are read through XPath queries. The overview is cut out of
    the serialised page with a regular expression; when that marker pair is
    absent ``overview_en`` is set to ``None`` instead of raising, so the
    remaining fields are still emitted.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)

    # 1-3. constants and page url
    school_name = 'Conestoga College'
    url = response.url
    location = 'Ontario, Canada'

    # 4. major_name_en
    major_name_en = ''.join(
        response.xpath('//*[@id="maincontent"]/h1').extract())
    major_name_en = remove_tags(major_name_en).strip()

    # 5-6. degree_name / degree_level. 'Advanced Diploma' must be tested
    # before the plain 'Diploma' substring.
    degree_name = ''.join(
        response.xpath(
            "//dt[contains(text(),'Credential:')]//following-sibling::dd[1]"
        ).extract())
    degree_name = remove_tags(degree_name)
    if 'Advanced Diploma' in degree_name:
        degree_name = 'Advanced Diploma'
        degree_level = 3
    elif 'Graduate Certificate' in degree_name:
        degree_name = 'Graduate Certificate'
        degree_level = 2
    elif 'Diploma' in degree_name:
        degree_name = 'Diploma'
        degree_level = 3
    else:
        # Any other credential keeps its scraped name and is given level 1.
        degree_level = 1

    # 7. campus
    campus = ''.join(
        response.xpath(
            "//*[@id='program-status']/table//tr[2]/td[2]").extract())
    campus = remove_tags(campus).strip()

    # 8. programme_code
    programme_code = ''.join(
        response.xpath(
            "//dt[contains(text(),'Program Code:')]//following-sibling::*[1]"
        ).extract())
    programme_code = remove_tags(programme_code).strip()

    # 9. department
    department = ''.join(
        response.xpath(
            "//dt[contains(text(),'School:')]//following-sibling::*[1]"
        ).extract())
    department = remove_tags(department).replace('& ', '')

    # 10. start_date: "MON, YYYY" tokens rewritten as YYYY-MM.
    start_date = ''.join(
        response.xpath(
            '//*[@id="program-status"]/table//tr/td[1]').extract()).strip()
    start_date = remove_tags(start_date)
    start_date = clear_space_str(start_date)
    start_date = re.findall(r'[A-Z]{3},\s\d+', start_date)
    start_date = ','.join(start_date).replace('\xa0', '')
    intake_rewrites = (
        ('JAN,2019', '2019-01'),
        ('MAY,2019', '2019-05'),
        ('SEP,2019', '2019-09'),
        ('JAN,2020', '2020-01'),
        ('MAY,2020', '2020-05'),
        ('AUG,2019', '2019-08'),
    )
    for label, iso_month in intake_rewrites:
        start_date = start_date.replace(label, iso_month)

    # 11. overview_en: text between the "About the Program" heading and the
    # marker comment that follows it in the page source.
    page_markup = ''.join(response.xpath('//*').extract())
    overview_match = re.search(
        r'<h2[\sA-Za-z\'\"=><-]+About the Program</h2>\s(.*?)\s <!-- The follow session is for program content information -->',
        page_markup, re.S)
    if overview_match is None:
        # Previously an IndexError here discarded the whole item.
        overview_en = None
    else:
        overview_en = remove_class(overview_match.group(1)).replace(
            '\t', '').replace('\r', '').replace('\n', '')

    # 12. modules_en
    modules_en = remove_class(
        ''.join(response.xpath('//*[@id="pc-noncoop"]').extract()))

    # 13. career_en
    career_en = remove_class(''.join(
        response.xpath(
            "//*[contains(text(),'Program Outcomes')]//following-sibling::*[1]"
        ).extract()))

    # 14-15. tuition: fixed amount for every programme.
    tuition_fee = 6000
    tuition_fee_pre = '$'

    # 16. entry_requirements_en
    entry_requirements_en = remove_class(''.join(
        response.xpath(
            "//*[contains(text(),'Admission Requirements')]//following-sibling::*[1]"
        ).extract()))

    # 17. specific_requirement_en
    specific_requirement_en = remove_class(''.join(
        response.xpath(
            "//*[contains(text(),'Program Requirements')]//following-sibling::*[1]"
        ).extract()))

    # 18-25. language requirements; the same description serves both tests.
    ielts_desc = ('<p>DIPLOMA/CERTIFICATE (EXCLUDING Practical Nursing Diploma) '
                  '6.0 IELTS (with no band less than 5.5) 80 TOEFL DEGREE '
                  '(INCLUDING Practical Nursing Diploma) 6.5 IELTS (with no '
                  'band less than 6.0) 88 TOEFL GRADUATE CERTIFICATE (Some '
                  'IT/engineering program required test scores may vary)6.5 '
                  'IELTS (with no band less than 6.0) 88 TOEFL</p>')
    toefl_desc = ielts_desc
    if degree_level in (1, 2):
        ielts = 6.5
        ielts_r = ielts_w = ielts_s = ielts_l = 6
        toefl = 88
    else:
        ielts = 6
        ielts_r = ielts_w = ielts_s = ielts_l = 5.5
        toefl = 80

    item['school_name'] = school_name
    item['url'] = url
    item['location'] = location
    item['major_name_en'] = major_name_en
    item['degree_name'] = degree_name
    item['degree_level'] = degree_level
    item['campus'] = campus
    item['programme_code'] = programme_code
    item['department'] = department
    item['start_date'] = start_date
    item['overview_en'] = overview_en
    item['modules_en'] = modules_en
    item['career_en'] = career_en
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = tuition_fee_pre
    item['entry_requirements_en'] = entry_requirements_en
    item['specific_requirement_en'] = specific_requirement_en
    item['ielts_desc'] = ielts_desc
    item['toefl_desc'] = toefl_desc
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['ielts_s'] = ielts_s
    item['ielts_l'] = ielts_l
    item['toefl'] = toefl
    yield item
def parse(self, response):
    """Scrape one Fleming College programme page into an item.

    Fields come from XPath queries on ``response``; the course list is
    downloaded from a linked page with ``requests``. Optional fields fall
    back to ``None`` when their markup is missing.

    NOTE(review): the final ``yield item`` is commented out in the source,
    so this callback currently builds the item and returns ``None``. It is
    left disabled here -- confirm whether that is intentional.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)

    def up_to(markup, stop_tag):
        # Keep the markup that precedes the first ``stop_tag``. str.find
        # returns -1 when the tag is absent; slicing with that value would
        # silently drop the last character, so the text is kept whole.
        end = markup.find(stop_tag)
        return markup if end == -1 else markup[:end]

    # 1-3. constants and page url
    school_name = 'Fleming College'
    url = response.url
    location = 'Ontario, Canada'

    # 4. major_name_en (required: a missing title still raises IndexError)
    major_name_en = response.xpath(
        '//*[@id="page-title"]/div/div/h2/text()').extract()[0]

    # 5. programme_code
    try:
        programme_code = response.xpath(
            "//div[contains(text(),'Program code')]//following-sibling::*"
        ).extract()[0]
        programme_code = remove_tags(programme_code)
        programme_code = clear_space_str(programme_code)
    except Exception:
        programme_code = None

    # 6-7. degree_name / degree_level. 'Advanced Diploma' must be tested
    # before the plain 'Diploma' substring.
    try:
        degree_name = response.xpath(
            "//span[contains(text(),'Credential: ')]//following-sibling::strong"
        ).extract()[0]
        degree_name = remove_tags(degree_name)
    except Exception:
        degree_name = ''
    if 'Graduate Certificate' in degree_name:
        degree_name = 'Graduate Certificate'
        degree_level = 2
    elif 'Advanced Diploma' in degree_name:
        degree_name = 'Advanced Diploma'
        degree_level = 3
    elif 'Diploma' in degree_name:
        degree_name = 'Diploma'
        degree_level = 3
    else:
        degree_name = None
        degree_level = None

    # 8-9. duration: first digit found in the credential-length text.
    duration_text = remove_tags(''.join(
        response.xpath(
            "//span[@class='program-credential-length']").extract()))
    duration_digits = re.findall(r'\d', duration_text)
    duration = duration_digits[0] if duration_digits else None
    duration_per = 2

    # 10. start_date: tab labels rewritten as YYYY-MM.
    start_date = ''.join(
        response.xpath('//*[@id="program-tabs"]/li/a').extract())
    start_date = re.findall(r'>([a-zA-Z0-9\s]+)<', start_date)
    start_date = ','.join(start_date).replace(
        'September 2019', '2019-09').replace(
            'May 2019', '2019-05').replace('January 2019', '2019-01')

    # 11. campus
    try:
        campus = response.xpath(
            "//div[contains(text(),'Offered at:')]//following-sibling::div/a"
        ).extract()[0]
        campus = remove_tags(''.join(campus))
    except Exception:
        campus = None

    # 12. overview_en: siblings after the heading, up to the next <h3.
    overview_en = ''.join(
        response.xpath(
            "//h3[contains(text(),'Program Highlights')]//following-sibling::*"
        ).extract())
    overview_en = remove_class(up_to(overview_en, '<h3'))

    # 13. career_en
    career_en = ''.join(
        response.xpath(
            "//h3[contains(text(),'Career Opportunities')]//following-sibling::*"
        ).extract())
    career_en = remove_class(up_to(career_en, '<h3'))

    # 14. entry_requirements_en: up to the first <div.
    entry_requirements_en = ''.join(
        response.xpath(
            "//h3[contains(text(),'Minimum Admission Requirements')]//following-sibling::*"
        ).extract())
    entry_requirements_en = remove_class(up_to(entry_requirements_en, '<div'))

    # 15. modules_en: fetched from the linked course-list page. The link is
    # checked before the host prefix is added; the previous length test ran
    # on the already-prefixed url and could never be false.
    modules_en = None
    modules_links = response.xpath(
        "//a[@class='icon icon-list']//@href").extract()
    if modules_links:
        modules_en_url = 'https://flemingcollege.ca' + modules_links[0]
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        data2 = requests.get(modules_en_url, headers=headers)
        response2 = etree.HTML(data2.text)
        module_nodes = response2.xpath('//*[@id="content"]/div[2]')
        if module_nodes:
            doc2 = ''.join(
                etree.tostring(node, encoding='unicode', pretty_print=False,
                               method='html') for node in module_nodes)
            modules_en = remove_class(doc2)

    # 16. other
    other = '1.缺少deadline,2.页面详情页没有学院,3.部分专业没有学费,4.'

    # 17. tuition_fee: first dollar amount in the international-fee cell.
    tuition_fee = remove_tags(''.join(
        response.xpath(
            "//span[contains(text(),'International:')]/../following-sibling::*"
        ).extract()))
    fee_amounts = re.findall(r'\$([0-9,\.]+)', tuition_fee)
    tuition_fee = fee_amounts[0] if fee_amounts else None

    # 18-20. currency prefixes and application fee
    tuition_fee_pre = '$'
    apply_pre = '$'
    apply_fee = '95'

    # 21-26. language requirements
    ielts = 6.0
    ielts_r = 5.5
    ielts_w = 5.5
    ielts_s = 5.5
    ielts_l = 5.5
    toefl = 65

    item['school_name'] = school_name
    item['url'] = url
    item['location'] = location
    item['major_name_en'] = major_name_en
    item['programme_code'] = programme_code
    item['degree_name'] = degree_name
    item['degree_level'] = degree_level
    item['duration'] = duration
    item['duration_per'] = duration_per
    item['start_date'] = start_date
    item['campus'] = campus
    item['overview_en'] = overview_en
    item['career_en'] = career_en
    item['entry_requirements_en'] = entry_requirements_en
    item['modules_en'] = modules_en
    item['other'] = other
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = tuition_fee_pre
    item['apply_pre'] = apply_pre
    item['apply_fee'] = apply_fee
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['ielts_s'] = ielts_s
    item['ielts_l'] = ielts_l
    item['toefl'] = toefl
    # yield item
def parse(self, response):
    """Scrape one Sheridan College programme page and yield the filled item.

    Header fields come from XPath queries on ``response``. The course list,
    career and admission sections live on sub-pages of the programme url and
    are downloaded with ``requests``. Optional fields fall back to ``None``
    when their markup is missing.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)

    def fetch_section(page_url):
        # Download a programme sub-page and return the cleaned markup of its
        # main content nodes, or None when the page has no such nodes.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        page = requests.get(page_url, headers=headers)
        tree = etree.HTML(page.text)
        nodes = tree.xpath('//*[@id="main"]/article/div[1]/div/div[2]/div/div')
        if not nodes:
            return None
        markup = ''.join(
            etree.tostring(node, encoding='unicode', pretty_print=False,
                           method='html') for node in nodes)
        return remove_class(markup)

    # 1-3. constants and page url
    school_name = 'Sheridan College'
    url = response.url
    location = 'Ontario, Canada'

    # 4. major_name_en
    major_name_en = remove_tags(''.join(
        response.xpath(
            '//*[@id="main"]/article/header[1]/div/div/div/div[1]/h1'
        ).extract()))

    # 5. programme_code
    try:
        programme_code = response.xpath(
            "//li[contains(text(),'Program code: ')]//b").extract()[0]
        programme_code = remove_tags(''.join(programme_code))
    except Exception:
        programme_code = None

    # 6. department
    department = remove_tags(''.join(
        response.xpath(
            '//*[@id="main"]/article/header[1]/div/div/div/div[1]/div[2]'
        ).extract())).replace('& ', '')

    # 7-8. degree_name / degree_level. 'Advanced Diploma' must be tested
    # before the plain 'Diploma' substring.
    try:
        degree_name = response.xpath(
            "//div[contains(@class,'plan-offering-header')]//h4"
        ).extract()[0]
        degree_name = remove_tags(''.join(degree_name))
    except Exception:
        degree_name = ''
    if 'Advanced Diploma' in degree_name:
        degree_name = 'Advanced Diploma'
        degree_level = 3
    elif 'Diploma' in degree_name:
        degree_name = 'Diploma'
        degree_level = 3
    elif 'Graduate Certificate' in degree_name:
        degree_name = 'Graduate Certificate'
        degree_level = 2
    elif 'Degree' in degree_name:
        # Degree programmes keep the scraped credential name.
        degree_level = 1
    else:
        degree_name = None
        degree_level = None

    # 9-10. duration: first digit of the second header bullet.
    duration_text = remove_tags(''.join(
        response.xpath(
            "//div[contains(@class,'plan-offering-header')]//ul//li[2]"
        ).extract()))
    duration_digits = re.findall(r'\d', duration_text)
    duration = duration_digits[0] if duration_digits else None
    duration_per = 1

    # 11-12. tuition: fixed amount for every programme.
    tuition_fee = '14,832'
    tuition_fee_pre = '$'

    # 13. start_date: de-duplicated in page order. The previous set() made
    # the order of the joined string vary from one run to the next.
    start_date = remove_tags(''.join(
        response.xpath('//figure/table//tr/td[1]').extract()))
    intakes = list(
        dict.fromkeys(re.findall(r'([a-zA-Z]+\s\d+)', start_date)))
    start_date = ','.join(intakes)
    intake_rewrites = (
        ('Jan 2019', '2019-01'),
        ('May 2019', '2019-05'),
        ('Sep 2019', '2019-09'),
        ('Jan 2020', '2020-01'),
        ('May 2020', '2020-05'),
    )
    for label, iso_month in intake_rewrites:
        start_date = start_date.replace(label, iso_month)

    # 14. campus
    try:
        campus = response.xpath('//figure/table//tr/td[2]').extract()[0]
        campus = ','.join(re.findall('<span>(.*?)</span>', campus))
    except Exception:
        campus = None

    # 15. overview_en
    overview_en = remove_class(''.join(
        response.xpath(
            '//*[@id="main"]/article/div[1]/div/div[2]').extract())).replace(
                'Program Summary (PDF)', '')

    # 16-18. sections hosted on sub-pages of the programme url. The old
    # length checks on these urls were always true and have been dropped.
    modules_en = fetch_section(url + '/courses/')
    career_en = fetch_section(url + '/career-opportunities')
    entry_requirements_en = fetch_section(url + '/admission-requirements')

    # 19-28. language requirements: diploma-level programmes (level 3) use
    # the lower thresholds, everything else the higher ones.
    if degree_level == 3:
        toefl = 80
        toefl_r = toefl_w = toefl_s = toefl_l = 20
        ielts = 6
        ielts_r = ielts_w = ielts_l = ielts_s = 5.5
    else:
        toefl = 88
        toefl_r = toefl_w = toefl_s = toefl_l = 21
        ielts = 6.5
        ielts_r = ielts_w = ielts_l = ielts_s = 6

    # 29. other
    other = 'other 1.部分专业没有学位名称2.duration部分官网页面上没有3.deadline未找到 4.申请费未找到'

    item['school_name'] = school_name
    item['url'] = url
    item['location'] = location
    item['major_name_en'] = major_name_en
    item['programme_code'] = programme_code
    item['department'] = department
    item['degree_name'] = degree_name
    item['degree_level'] = degree_level
    item['duration'] = duration
    item['duration_per'] = duration_per
    item['tuition_fee'] = tuition_fee
    item['tuition_fee_pre'] = tuition_fee_pre
    item['start_date'] = start_date
    item['campus'] = campus
    item['overview_en'] = overview_en
    item['modules_en'] = modules_en
    item['career_en'] = career_en
    item['entry_requirements_en'] = entry_requirements_en
    item['toefl'] = toefl
    item['toefl_r'] = toefl_r
    item['toefl_w'] = toefl_w
    item['toefl_s'] = toefl_s
    item['toefl_l'] = toefl_l
    item['ielts'] = ielts
    item['ielts_r'] = ielts_r
    item['ielts_w'] = ielts_w
    item['ielts_s'] = ielts_s
    item['ielts_l'] = ielts_l
    item['other'] = other
    yield item
def parse(self, response):
    """Scrape one St. Clair College programme page; yield one item per campus.

    Page-derived fields are best-effort: when the node a field is read from
    is missing, the field is stored as None instead of aborting the page.
    The sentinel string 'No' marks a programme that must not be exported
    (plain certificates and programmes whose length is given in weeks).

    Args:
        response: the downloaded programme page.

    Yields:
        A separate copy of the populated item for every campus entry of the
        form "<campus> (<programme code>)". Entries that do not have that
        shape are skipped.
    """
    item = getItem.get_item(ScrapyschoolCanadaCollegeItem)

    # Removes HTML attributes (class="...", id='...') from extracted markup.
    attr_pattern = r''' [a-zA-Z\-]*=['"].+?['"]'''

    def drop_space_runs(text, replacement=''):
        # NOTE(review): in this copy of the file the cleanup literals appear
        # as a single space, which would delete every space in the text (and
        # make the 'CAREER OPPORTUNITIES' lookup below unmatchable). They are
        # treated here as layout whitespace, i.e. runs of two or more spaces.
        # Confirm against the original literals.
        return re.sub(r' {2,}', replacement, text)

    # Programme (major) name.
    title_nodes = response.xpath(
        '//*[@id="contentheader_content"]|//*[@id="body"]/div[2]/div[2]/h1'
    ).extract()
    if title_nodes:
        major_name_en = drop_space_runs(
            remove_tags(title_nodes[0]).replace('\r\n', ''))
    else:
        major_name_en = None

    # Campus entries; each one is split into campus / programme code below.
    campus_text = remove_tags(''.join(response.xpath(
        '//li[contains(text(),"Campus:")]/following-sibling::li').extract()))
    campus_text = drop_space_runs(campus_text.replace('\r\n', ''), '==')
    campus_list = campus_text.lstrip('==').split('==')

    # The "Program Length" entry carries both the duration and the credential.
    length_nodes = response.xpath(
        '//li[contains(text(),"Program Length")]/following-sibling::li'
    ).extract()
    length_text = remove_tags(length_nodes[0]) if length_nodes else None

    # Duration in years; 'No' excludes programmes measured in weeks.
    duration = None
    if length_text is not None:
        duration_markers = (
            ('Four Year', '4'),
            ('Two Year', '2'),
            ('One Year', '1'),
            ('weeks', 'No'),
            ('Three Year', '3'),
        )
        for marker, value in duration_markers:
            if marker in length_text:
                duration = value
                break

    school_name = 'St. Clair College'

    # Department, taken from the breadcrumb trail.
    crumb_nodes = response.xpath(
        '//*[@id="block-nu-breadcrumbs"]/div/nav/ol/li[4]/a').extract()
    department = remove_tags(crumb_nodes[0], keep=("i")) if crumb_nodes else None

    # Pages that are treated as post-graduate / post-diploma programmes.
    post_graduate_urls = (
        'http://www.stclaircollege.ca/programs/postsec/hr_management/',
        'http://www.stclaircollege.ca/programs/postsec/international_bus_mgmt/',
        'http://www.stclaircollege.ca/programs/postsec/supply_chain_management/',
        'http://www.stclaircollege.ca/programs/postsec/autism/',
        'http://www.stclaircollege.ca/programs/postsec/autism_alternate',
        'http://www.stclaircollege.ca/programs/postsec/child_youth_acc/',
        'http://www.stclaircollege.ca/programs/postsec/dsw_acc/',
        'http://www.stclaircollege.ca/programs/postsec/ece_acc/',
        'http://www.stclaircollege.ca/programs/postsec/event_management/',
        'http://www.stclaircollege.ca/programs/postsec/media_convergence/',
        'http://www.stclaircollege.ca/programs/postsec/wia/',
        'http://www.stclaircollege.ca/programs/postsec/borderservices_ft/',
        'http://www.stclaircollege.ca/programs/postsec/paralegal_accelerated/',
        'http://www.stclaircollege.ca/programs/postsec/police_ft/',
        'http://www.stclaircollege.ca/programs/postsec/psi_ft/',
    )

    # Credential name and level. Both default to None so that a page without
    # a "Program Length" entry no longer leaves degree_level unbound.
    degree_name = None
    degree_level = None
    if length_text is not None:
        if 'Advanced Diploma' in length_text:
            degree_name, degree_level = 'Advanced Diploma', '4'
        elif response.url in post_graduate_urls:
            degree_level = '2'
            if 'Graduate Certificate' in length_text:
                degree_name = 'Post_graduate(Certificate)'
            elif 'Diploma' in length_text:
                degree_name = 'Post-Diploma'
        elif 'Certificate' in length_text:
            degree_name, degree_level = 'No', '0'
        else:
            degree_name, degree_level = 'Diploma', '3'

    # The right-hand column holds the overview, career and admission sections.
    right_column = response.xpath('//*[@id="rightcolumn_inner"]').extract()
    column_html = re.sub(attr_pattern, '', ''.join(right_column))

    # Programme overview (also used as the degree description).
    found = re.findall('PROGRAM OVERVIEW(.*)PROGRAM HIGHLIGHTS',
                       column_html.replace('\n', ''))
    degree_overview_en = (
        found[0].replace('\t', '').replace('\r', '') if found else None)
    overview_en = degree_overview_en
    degree_name_desc = degree_overview_en

    # Intake dates, normalised to "YYYY-MM" and comma separated.
    start_nodes = response.xpath(
        '//li[contains(text(),"Starts")]/following-sibling::li').extract()
    if start_nodes:
        start_date = remove_tags(start_nodes[0])
        start_replacements = (
            ('September', '2019-09'),
            ('January', '2019-01'),
            ('May', '2019-05'),
            ('July', '2019-07'),
            ('March', '2019-03'),
            ('October', '2019-10'),
            (' and ', ','),
            (' & ', ','),
            (', ', ','),
            (' (Semester 3)', ''),
            (' (Semester 4)', ''),
            (' (Windsor only)', ''),
        )
        for old, new in start_replacements:
            start_date = start_date.replace(old, new)
    else:
        start_date = None

    # Course modules.
    modules_en = ''.join(response.xpath(
        '//h4[contains(text(),"REQUIREMENTS")]/following-sibling::div'
    ).extract())
    modules_en = re.sub(attr_pattern, '', modules_en)
    modules_en = drop_space_runs(
        modules_en.replace('\r\n', '').replace('\n', '').replace('\t', ''))

    # Career opportunities.
    career_en = None
    if right_column:
        career_html = re.sub(attr_pattern, '', right_column[0])
        career_html = drop_space_runs(
            career_html.replace('\r\n', '').replace('\n', '').replace('\t', ''))
        found = re.findall(
            'CAREER OPPORTUNITIES(.*)ADDITIONAL INFORMATION', career_html)
        if found:
            career_en = found[0]

    # Application deadline is only known for the January intake.
    if start_date is not None and '2019-01' in start_date:
        deadline = '2019-04-26'
    else:
        deadline = None

    # Tuition is only published as a PDF fee sheet, so the link is stored.
    tuition_fee = 'http://www.stclaircollege.ca/programs/postsec/docs/fees/Tuition-Fee-Sheet-2018-19.pdf'
    apply_fee = '125'

    # Admission requirements.
    found = re.findall('ADMISSION REQUIREMENTS(.*)PROGRAM OVERVIEW',
                       column_html.replace('\r\n', ''))
    entry_requirements_en = found[0] if found else None

    # Requirements for international (including Chinese) applicants.
    require_chinese_en = remove_tags(
        '<div>If you are not a Canadian citizen or landed immigrant, you will need a Canadian Study Permit to study in Canada and a Letter of Acceptance - issued by the College - in order to apply for the permit.'
        '<br><br>General diploma programs admission requirement for international students are as follows:'
        '<br><br><b>Canada & USA:</b> High school diploma and Grade 12 transcript, or equivalent.'
        '<br><br><b>British education system:</b> General Certificate of Education showing passes in six (6) academic subjects (including English) at the Ordinary level.'
        '<br><br><b>Caribbean countries (English):</b> High school diploma and transcript, or equivalent.'
        '<br><br><b>West Africa (Nigeria, Ghana, etc.):</b> WAEC (or NECO) transcript with online verifying scratch card number and PIN required.'
        '<br><br><b>South Asia (Pakistan, Bangladesh, Nepal, Sri Lanka, etc.):</b> High school diploma and Grade 12 transcript. '
        'IELTS 5.5 with no band lower than 5.0, or PTE (Pearson Test of English) Level 51, or TOEFL paper-based test (PBT) score of 500, or internet-based test (iBT) score of 61, or computer-based test (CBT) score of 173.'
        '<br><br><strong>India:</strong> High school diploma and Grade 12 transcript. '
        'IELTS 6.0 with no band lower than 5.5, or PTE (Pearson Test of English) Level 51, proficiency tests must have been taken within the previous two years from the date of submitting application.'
        '<br><br>Other regions (English proficiency test required)<ul> '
        '<li>High school diploma/graduation certificate and transcript or equivalent in original language and English translation.</li> '
        '<li>English proficiency requirement: TOEFL iBT 61, IELTS (overall band) 5.5, or PTE (Pearson Test of English) Level 51</li></ul>'
        'Note:<ul> '
        '<li>For information about IELTS, see <a href="/programs/coned/ielts.html">International English Language Testing System</a>.</li> '
        '<li>Applicants who do not have the minimum TOEFL or IELTS score will be required to write the English Proficiency Test (CanTEST) after arrival at the College. '
        'Students who do not pass the CanTEST will need to register in the St. '
        'Clair College English as a Second Language (ESL) program prior to entering post-secondary programs.</li> '
        '<li>Pharmacy Technician for the Pharmacy Technician Program.</li></ul></div>'
    )

    # English-language requirements (the site publishes overall scores only).
    english_desc = 'English proficiency requirement: TOEFL iBT 61, IELTS (overall band) 5.5, or PTE (Pearson Test of English) Level 51'

    other = '1.http://www.stclaircollege.ca/registrar/ ——》deadline 2.specific_requirement_en(确认没有) 3.http://www.stclaircollege.ca/international/admissionpolicies.html ——》雅思 托福 确认没有小分(没有托福code)4.average_score:等待老师给经验值4.http://www.stclaircollege.ca/programs/postsec/docs/fees/Tuition-Fee-Sheet-2018-19.pdf 4,学费联系校代'

    fields = {
        'school_name': school_name,
        'department': department,
        'degree_name': degree_name,
        'degree_name_desc': degree_name_desc,
        'major_name_en': major_name_en,
        'overview_en': overview_en,
        'start_date': start_date,
        'duration': duration,
        'duration_per': '1',
        'modules_en': modules_en,
        'career_en': career_en,
        'deadline': deadline,
        'apply_pre': 'CAD$',
        'apply_fee': apply_fee,
        'tuition_fee_pre': 'CAD$',
        'tuition_fee': tuition_fee,
        'tuition_fee_per': '1',
        'entry_requirements_en': entry_requirements_en,
        'require_chinese_en': require_chinese_en,
        # The fields below had empty placeholder XPath lookups that could
        # only ever fall back to None, so None is stored directly.
        'specific_requirement_en': None,
        'average_score': '',
        'gaokao_desc': None,
        'gaokao_zs': None,
        'huikao_desc': None,
        'huikao_zs': None,
        'ielts_desc': english_desc,
        'ielts': '5.5',
        'ielts_l': '',
        'ielts_s': '',
        'ielts_r': '',
        'ielts_w': '',
        'toefl_code': None,
        'toefl_desc': english_desc,
        'toefl': '61',
        'toefl_l': None,
        'toefl_s': None,
        'toefl_r': None,
        'toefl_w': None,
        'interview_desc_en': None,
        'portfolio_desc_en': None,
        'other': other,
        'url': response.url,
        'degree_level': degree_level,
    }
    for key, value in fields.items():
        item[key] = value

    # Excluded programmes produce no items at all. The sentinel is compared
    # by equality so that None (unknown) no longer raises TypeError.
    if degree_name == 'No' or duration == 'No':
        return

    for entry in campus_list:
        # Drop any " - ..." suffix, then split "<campus> (<code>)".
        entry = re.sub('( -.*)', '', entry)
        campus_names = re.findall(r'(.*)\(.*\)', entry)
        programme_codes = re.findall(r'\((.*)\)', entry)
        if not campus_names or not programme_codes:
            continue
        item["campus"] = campus_names[0]
        item["programme_code"] = programme_codes[0]
        item["location"] = campus_names[0]
        # Yield a copy: the same item object is reused for the next campus.
        # NOTE(review): assumes the item type offers .copy() (Scrapy items
        # and dicts do) — confirm for ScrapyschoolCanadaCollegeItem.
        yield item.copy()
def parse(self, response):
    """Scrape one George Brown College programme page into a single item.

    Args:
        response: the downloaded programme page.

    Yields:
        The populated item. A page without a recognisable duration is still
        exported, with ``duration`` and ``duration_per`` set to None.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)

    def labelled_text(label):
        # Tag-stripped text of everything following the <div> that holds
        # the given label in the programme fact sheet.
        nodes = response.xpath(
            "//div[contains(text(),'%s')]//following-sibling::*" % label
        ).extract()
        return remove_tags(''.join(nodes))

    def section_html(xpath):
        # Markup of a page section, cleaned by remove_class.
        return remove_class(''.join(response.xpath(xpath).extract()))

    school_name = 'George Brown College'
    url = response.url
    location = 'Ontario, Canada'

    major_name_en = labelled_text('Program name').replace('& ', '')
    programme_code = labelled_text('Code')

    # Duration value and unit code: 1 = year, 2 = semester, 3 = month,
    # 4 = week. The unit is matched case-insensitively.
    duration = None
    duration_per = None
    duration_match = re.search(r'(\d+)\s([a-zA-Z]+)', labelled_text('Duration'))
    if duration_match:
        duration = duration_match.group(1)
        unit = duration_match.group(2).lower()
        if 'year' in unit:
            duration_per = 1
        elif 'semester' in unit:
            duration_per = 2
        elif 'month' in unit:
            duration_per = 3
        elif 'week' in unit:
            duration_per = 4
        elif duration == '1':
            # Unknown unit: the value is a string, so compare against '1'
            # (comparing with the integer 1 could never be true).
            duration_per = 2
        else:
            duration_per = 3

    department = clear_space_str(labelled_text('School'))

    # Credential name and level: 3 = diploma, 2 = graduate certificate,
    # 1 = everything else.
    degree_name = clear_space_str(labelled_text('Credential'))
    if 'Advanced Diploma' in degree_name:
        degree_name, degree_level = 'Advanced Diploma', 3
    elif 'Diploma' in degree_name:
        degree_name, degree_level = 'Diploma', 3
    elif 'Graduate Certificate' in degree_name:
        degree_name, degree_level = 'Graduate Certificate', 2
    else:
        degree_level = 1

    # Intake months mapped to "YYYY-MM" values.
    start_text = labelled_text('Starting month')
    if 'September, January, May' in start_text:
        start_date = '2019-01,2019-05,2019-09'
    elif 'September, January' in start_text:
        start_date = '2019-01,2019-09'
    elif 'May' in start_text:
        start_date = '2019-05'
    elif 'September' in start_text:
        start_date = '2019-09'
    elif 'January' in start_text:
        start_date = '2019-01'
    else:
        start_date = None

    campus = clear_space_str(labelled_text('Location'))

    overview_en = section_html('//*[@id="overview-intro"]/div[1]/div')
    modules_en = section_html('//*[@id="coursesContent"]/div[1]')
    career_en = section_html('//*[@id="careersContent"]/div[1]')
    entry_requirements_en = section_html(
        '//*[@id="admReqsContent"]/div[1]/div[1]')

    tuition_fee = '16,500' if degree_level == 1 else '13,520'

    ielts_desc = 'Diploma/Certificate Programs:6.0, minimum 5.5 in each skill band;Postgraduate Programs and Fast-track/ Bridges**:6.5, minimum 6.0 in each skill band;GBC Degree Programs*:6.5, minimum 6.0 in each skill band'
    toefl_desc = 'Diploma/Certificate Programs:80 (online) minimum 20 in each skill band,Postgraduate Programs and Fast-track/ Bridges**:88 (online) minimum 22 in each skill band,GBC Degree Programs*:84 (online) minimum 21 in each skill band'

    # Scores follow the two descriptions above. Graduate certificates
    # (level 2) use the postgraduate row rather than the diploma row.
    if degree_level == 2:
        ielts, ielts_band = 6.5, 6
        toefl, toefl_band = 88, 22
    elif degree_level == 1:
        ielts, ielts_band = 6.5, 6
        toefl, toefl_band = 84, 21
    else:
        ielts, ielts_band = 6, 5.5
        toefl, toefl_band = 80, 20

    fields = {
        'school_name': school_name,
        'url': url,
        'location': location,
        'major_name_en': major_name_en,
        'programme_code': programme_code,
        'duration': duration,
        'duration_per': duration_per,
        'department': department,
        'degree_name': degree_name,
        'degree_level': degree_level,
        'start_date': start_date,
        'campus': campus,
        'overview_en': overview_en,
        'modules_en': modules_en,
        'career_en': career_en,
        'entry_requirements_en': entry_requirements_en,
        'tuition_fee': tuition_fee,
        'tuition_fee_pre': '$',
        'apply_fee': '95',
        'apply_pre': '$',
        'ielts_desc': ielts_desc,
        'ielts_r': ielts_band,
        'ielts_w': ielts_band,
        'ielts_s': ielts_band,
        'ielts_l': ielts_band,
        'deadline': '2019-05-31',
        'toefl_desc': toefl_desc,
        'toefl_r': toefl_band,
        'toefl_s': toefl_band,
        'toefl_w': toefl_band,
        'toefl_l': toefl_band,
        'ielts': ielts,
        'toefl': toefl,
    }
    for key, value in fields.items():
        item[key] = value
    yield item
def parse(self, response):
    """Scrape one Algonquin College programme page into a single item.

    Any exception raised while extracting is appended, together with the
    page URL, to an error file named after the school; in that case no item
    is yielded for the page.

    Args:
        response: the downloaded programme page.

    Yields:
        The populated item.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)
    item['school_name'] = "Algonquin College"
    item['url'] = response.url
    print("===========================")
    print(response.url)
    item['other'] = """问题描述: 1.overview、modules、career空缺的是页面上没有"""
    item['start_date'] = "2019-01,2019-05,2019-09"
    item['deadline'] = ""
    # https://www.algonquincollege.com/international/future-students-2018/admissions-apply/
    item['apply_pre'] = "CAD$"
    item['apply_fee'] = '95'

    try:
        # Programme (major) name.
        major_name_en = response.xpath(
            "//div[@class='program_title']/h1//text()|"
            "//span[@id='programNamePlain']//text()").extract()
        clear_space(major_name_en)
        if len(major_name_en) > 0:
            item['major_name_en'] = ''.join(major_name_en).strip()
        print("item['major_name_en']: ", item['major_name_en'])

        # Department, from the breadcrumb.
        department = response.xpath(
            "//div[@y='col-md-10 breadcrumb']//span[2]//text()|"
            "//div[@class='col-md-10 breadcrumb']//span[2]//text()").extract()
        clear_space(department)
        if len(department) > 0:
            item['department'] = ''.join(department).strip()

        # Campus and the matching postal address.
        campus = response.xpath(
            "//strong[contains(text(),'Campus:')]/following-sibling::*//text()"
        ).extract()
        clear_space(campus)
        if len(campus) > 0:
            item['campus'] = ''.join(campus).strip()
        if item['campus'] == "Pembroke":
            item['location'] = "1 College Way Pembroke, Ontario K8A 0C8"
        elif item['campus'] == "Perth":
            item['location'] = "7 Craig Street Perth, Ontario K7H 1X7"
        elif item['campus'] == "Ottawa":
            item['location'] = "1385 Woodroffe Avenue Ottawa, Ontario K2G 1V8"

        programme_code = response.xpath(
            "//strong[contains(text(),'Program Code:')]/following-sibling::*//text()"
        ).extract()
        clear_space(programme_code)
        if len(programme_code) > 0:
            item['programme_code'] = ''.join(programme_code).strip()

        # Credential name and level: 3 = diploma, 2 = graduate, 1 = degree.
        degree_name = response.xpath(
            "//strong[contains(text(),'Credential:')]/following-sibling::*//text()"
        ).extract()
        clear_space(degree_name)
        if len(degree_name) > 0:
            item['degree_name'] = ''.join(degree_name).replace("Ontario College", "").strip()
        if item['degree_name'] is not None:
            if "diploma" in item['degree_name'].lower():
                item['degree_level'] = 3
            elif "Graduate" in item['degree_name']:
                item['degree_level'] = 2
            elif "Degree" in item['degree_name']:
                # Degrees are named after the major, keeping "Honours".
                if "Honours" in item['degree_name'] and "Honours" not in item['major_name_en']:
                    item['degree_name'] = "Honours " + item['major_name_en']
                else:
                    item['degree_name'] = item['major_name_en']
                item['degree_level'] = 1
        if "Bachelor of" in item['major_name_en'] and item['degree_name'] is None:
            item['degree_name'] = item['major_name_en']
            item['degree_level'] = 1
        print("item['degree_name']1: ", item['degree_name'])

        # Duration value and unit code (1 = year, 3 = month, 4 = week);
        # a later unit found in the text overrides an earlier one.
        duration = response.xpath(
            "//strong[contains(text(),'Duration:')]/following-sibling::*//text()"
        ).extract()
        clear_space(duration)
        duration_str = ''.join(duration).strip()
        item['duration'] = ''.join(re.findall(r"[\d]+", duration_str))
        duration_lower = ''.join(duration).lower()
        if "year" in duration_lower:
            item['duration_per'] = 1
        if "month" in duration_lower:
            item['duration_per'] = 3
        if "week" in duration_lower:
            item['duration_per'] = 4

        # Overview: everything before the BYOD notice, else before the
        # success-factors / eligibility headings.
        overview = response.xpath(
            "//strong[contains(text(),'Bring Your Own Device (BYOD)')]/../preceding-sibling::*"
        ).extract()
        if len(overview) == 0:
            overview = response.xpath(
                "//h3[contains(text(),'SUCCESS FACTORS')]/preceding-sibling::*|"
                "//h2[contains(text(),'Program Eligibility')]/preceding-sibling::*[position()<last()-1]|"
                "//strong[contains(text(),'Program Eligibility')]/../preceding-sibling::*[position()<last()-1]"
            ).extract()
        if len(overview) > 0:
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

        # Course modules, minus the "Read More" teasers and bare hour counts.
        modules = response.xpath(
            "//div[@id='courses']//*[@class='level-container']").extract()
        if len(modules) > 0:
            item['modules_en'] = remove_class(clear_lianxu_space(modules)).replace("<div>Hours</div>", "").strip()
            print("item['modules_en']: ", item['modules_en'])
            del_key1 = re.findall(r"<p><span>.*?Read More</p>", item['modules_en'])
            print("del_key1: ", del_key1)
            del_key2 = re.findall(r"<p>[\d\.]+?</p>", item['modules_en'])
            print("del_key2: ", del_key2)
            for d_k1 in del_key1:
                item['modules_en'] = item['modules_en'].replace(d_k1, '').strip()
            for d_k2 in del_key2:
                item['modules_en'] = item['modules_en'].replace(d_k2, '').strip()
            print("item['modules_en']=== ", item['modules_en'])

        career = response.xpath(
            "//h3[contains(text(),'Learning Outcomes')]/preceding-sibling::*").extract()
        if len(career) > 0:
            item['career_en'] = remove_class(clear_lianxu_space(career))

        entry_requirements_en = response.xpath(
            "//div[@id='application_admission']//div[@class='col-sm-8']").extract()
        if len(entry_requirements_en) > 0:
            item['entry_requirements_en'] = remove_class(clear_lianxu_space(entry_requirements_en))

        # IELTS / TOEFL figures quoted inside the admission requirements.
        if item['entry_requirements_en'] is not None:
            ielts_list = re.findall(r"IELTS\-[\w\W]{1,120}", item['entry_requirements_en'])
            if len(ielts_list) > 0:
                item['ielts_desc'] = ielts_list[0]
                ielts_dict = get_ielts(ielts_list[0])
                item['ielts'] = ielts_dict.get("IELTS")
                item['ielts_l'] = ielts_dict.get("IELTS_L")
                item['ielts_s'] = ielts_dict.get("IELTS_S")
                item['ielts_r'] = ielts_dict.get("IELTS_R")
                item['ielts_w'] = ielts_dict.get("IELTS_W")

            toefl_list = re.findall(r"TOEFL\-Internet\-based[\w\W]{1,120}", item['entry_requirements_en'])
            if len(toefl_list) > 0:
                item['toefl_desc'] = toefl_list[0]
            if item['toefl_desc'] is not None:
                toefl = re.findall(r"overall\s\d+", item['toefl_desc'], re.I)
                toefl_ench = re.findall(r"\d+\sin each component", item['toefl_desc'], re.I)
                # Store None, not '', when a figure is absent: the
                # degree-level defaults below only apply to None. Only the
                # first hit is used so multiple hits are not concatenated.
                if toefl:
                    item['toefl'] = toefl[0].lower().replace("overall", "").strip()
                else:
                    item['toefl'] = None
                if toefl_ench:
                    toefl_each = toefl_ench[0].lower().replace("in each component", "").strip()
                else:
                    toefl_each = None
                item['toefl_l'] = toefl_each
                item['toefl_s'] = toefl_each
                item['toefl_r'] = toefl_each
                item['toefl_w'] = toefl_each

        item['tuition_fee_per'] = "1"
        item['tuition_fee_pre'] = "CAD$"

        # Requirements for Chinese applicants, by degree level.
        if item['degree_level'] == 3:
            item['require_chinese_en'] = (
                "<p>Admission To Postsecondary Certificate or Diploma Program (1/2/3 years)</p> "
                "<p>• One of the following:</p> "
                "<p> 1. The National Senior High School Examination with a minimum grade of 65% or C in relevant subjects (School Leaving Certificate).</p> "
                "<p> 2. Graduation Certificate awarded by senior (upper) middle school; may be academic or vocationally oriented with a minimum of C or 65% in relevant subjects.</p> "
                "<p> 3. Matriculation Examination with a minimum mark of 490. </p> "
                "<p>• English Proficiency requirements are as follows for consideration:</p> "
                "<p> 1. A minimum TOEFL score of 80 (internet based with no single test score below 20) </p> "
                "<p> 2. Or IELTS with an overall minimum score of 6.0 (with no single test score below 5.5) </p> "
                "<p> 3. Or CAEL (Canadian Academic English Language Assessment) with an overall band score of 60</p>"
            )
        elif item['degree_level'] == 2:
            item['require_chinese_en'] = (
                "<p>Admission To A Post Graduate Certificate Program (1 year)</p> "
                "<p>• Bachelor’s degree and University transcripts</p> "
                "<p>• English Proficiency requirements are as follows for consideration:</p> "
                "<p> 1. A minimum TOEFL score of 88 (internet based with no single test score below 21) </p> "
                "<p> 2. Or an IELTS with an overall minimum score of 6.5 (with no single test score below 6.0) </p> "
                "<p> 3. Or CAEL with an overall band score of 70</p>"
            )
        elif item['degree_level'] == 1:
            item['require_chinese_en'] = (
                "<p>Admission To Postsecondary Bachelor’s Degree Program (4 years)</p> "
                "<p>• One of the following:</p> "
                "<p> 1. The National Senior High School Examination with a minimum grade of 70% or B in relevant subjects (School Leaving Certificate).</p> "
                "<p> 2. Graduation Certificate awarded by senior (upper) middle school; may be academic or vocationally oriented with a minimum of B or 70% in relevant subjects.</p> "
                "<p> 3. Matriculation Examination with a minimum mark of 525. Include transcripts for any postsecondary courses or programs completed</p> "
                "<p>• Include transcripts for any post-secondary courses or programs completed</p> "
                "<p>• English Proficiency requirements are as follows for consideration:</p> "
                "<p> 1. A minimum TOEFL score of 88 (internet based with no single test score below 21) </p> "
                "<p> 2. Or an IELTS with an overall minimum score of 6.5 (with no single test score below 6.0) </p> "
                "<p> 3. Or CAEL with an overall band score of 70</p>"
            )

        # Fall back to the college-wide minimums when the page gave none.
        if item['ielts'] is None:
            if item['degree_level'] == 3:
                ielts_default = ('6.0', '5.5')
            elif item['degree_level'] in (1, 2):
                ielts_default = ('6.5', '6.0')
            else:
                ielts_default = None
            if ielts_default is not None:
                item['ielts'] = ielts_default[0]
                item['ielts_l'] = ielts_default[1]
                item['ielts_s'] = ielts_default[1]
                item['ielts_r'] = ielts_default[1]
                item['ielts_w'] = ielts_default[1]

        if item['toefl'] is None:
            if item['degree_level'] == 3:
                toefl_default = ('80', '20')
            elif item['degree_level'] in (1, 2):
                toefl_default = ('88', '21')
            else:
                toefl_default = None
            if toefl_default is not None:
                item['toefl'] = toefl_default[0]
                item['toefl_l'] = toefl_default[1]
                item['toefl_s'] = toefl_default[1]
                item['toefl_r'] = toefl_default[1]
                item['toefl_w'] = toefl_default[1]

        # Tuition comes from the fee estimator, switched to the
        # international residency option.
        fee_url = response.xpath(
            "//a[contains(text(),'Tuition and Fee Estimator')]/@href").extract()
        if len(fee_url) > 0:
            fee_url_tmp = 'https://www.algonquincollege.com' + ''.join(fee_url[0])
            fee_url_tmp = fee_url_tmp.replace("residency=canadian&", "residency=international&").strip()
            item['tuition_fee'] = self.parse_tuition_fee(fee_url_tmp)

        yield item
    except Exception as e:
        # Record the failure and the page it happened on; the page is skipped.
        with open("scrapySchool_Canada_College/error/" + item['school_name'] + ".txt", 'a', encoding="utf-8") as f:
            f.write(str(e) + "\n" + response.url + "\n========================\n")
        print("异常:", str(e))
        print("报错url:", response.url)
def parse(self, response):
    """Build the programme item for one Alberta College of Art and Design page.

    School-wide constants (address, application fee, intake, language,
    portfolio and entry requirements, tuition) are hard-coded from the
    pages referenced in the comments below.  The programme name, degree
    name and overview are scraped from ``response``.

    ``degree_level`` is set from the degree name: ``3`` for "Diploma",
    ``1`` when it contains "Bachelor", ``2`` when it contains "Post".

    Any exception raised while scraping is appended to a per-school error
    log and printed; it is not re-raised.

    Yields:
        The populated item, only when a degree level was recognised.
    """
    import os  # local import: only needed by the error-log handler below

    item = get_item(ScrapyschoolCanadaCollegeItem)
    item['school_name'] = "Alberta College of Art and Design"
    item['url'] = response.url
    print("===========================")
    print(response.url)

    item['location'] = '1407 - 14 Ave. NW Calgary, Alberta, Canada T2N 4R3'
    item['other'] = """问题描述: 1.没有就业信息和课程长度"""

    # https://www.acad.ca/future-students/how-apply/how-apply-bachelors-degree/how-apply-if-youre-international-student
    item['apply_pre'] = "CAD$"
    item['apply_fee'] = '110'
    item['start_date'] = '2019-09'
    item['deadline'] = "2019-02-01"

    # https://www.acad.ca/future-students/how-apply/how-apply-bachelors-degree/academic-requirements/english-requirements
    item['ielts_desc'] = 'A score of 6.5 or higher on the International English Language Test (IELTS)'
    item['ielts'] = '6.5'
    item['toefl_desc'] = 'A score of 83 or higher on an official Test of English as a Foreign Language (TOEFL) on the Internet-based test (iBT)'
    item['toefl'] = '83'

    item['portfolio_desc_en'] = """ <div> <h1>Portfolio requirements and statement of intent</h1> <span></span><span></span> <p> Your portfolio has two major components, a statement of intent and samples of your work. Both are an important part of your application to ACAD, and a key way for the review committee to get to know more about you. </p> <h3> 1. Statement of intent</h3> <p> This is where you tell us all about you – your background, inspiration, and goals. Your statement of intent should be approximately 500 words in length and explain the following:</p> <ul> <li> Why do you want to study at ACAD?</li> <li> Why do you want to study visual art and design?</li> <li> What mediums or artists inspire you?</li> <li> How will you benefit from studies in art and design?</li> </ul> <h3> 2. Samples of your work</h3> <p> Here’s where you get to really show us your stuff!
Choose examples that best represent your abilities, your personality, and be sure to follow these guidelines:</p> <ol> <li> Select 12 to 15 samples of your artwork.</li> <li> Include representational drawing examples, including one or two observational drawings of figure, landscape or still life.</li> <li> Include artwork created in a variety of mediums that explore different tools, techniques, and ideas.</li> <li> Demonstrate how you express ideas and concepts, preferably in work you’ve done on your own initiative outside of the classroom (for example, how do you respond creatively to current events, issues, or themes of personal interest?).</li> </ol> <p> Once you’ve written your personal statement and selected the work you want to include in your portfolio, follow our instructions on <a>how to photograph your portfolio</a><strong> </strong>and <a>how to submit your portfolio</a><a>.</a></p> </div> """

    # https://www.acad.ca/current-students/pay-tuition-and-fees/undergraduate-tuition-and-fees
    item['tuition_fee'] = '14,934.9'
    item['tuition_fee_per'] = 1
    item['tuition_fee_pre'] = 'CAD$'

    # https://acad.ca/future-students/how-apply-bachelors-degree/academic-requirements
    item['entry_requirements_en'] = """<p>Applicants must possess a high school diploma, have achieved a grade of at least 60% in four grade 12 subjects, including a grade of 60% or higher your school’s highest-level English class (or equivalent), and meet English language proficiency requirements.
There are also specific portfolio requirements for all applicants.</p>"""

    # https://www.acad.ca/future-students/how-apply/how-apply-bachelors-degree/how-apply-if-youre-international-student
    item['require_chinese_en'] = """<p>To attend a Bachelors degree program at ACAD, you must have the equivalent of an Alberta high school diploma, with a minimum average grade of 60% (or equivalent) in your final year of studies.</p> <p>If you attended high school in a language other than English you’ll also need to meet our English language proficiency requirements for undergraduate students.</p>"""

    # The navigation shows abbreviated degree names; store the full title.
    degree_abbreviations = {
        "BFA": "Bachelor of Fine Arts",
        "BDes": "Bachelor of Design",
    }

    try:
        major_name_en = response.xpath(
            "//div[@class='large-8 content-con columns']/h1//text()"
        ).extract()
        clear_space(major_name_en)
        item['major_name_en'] = ''.join(major_name_en).strip()
        print("item['major_name_en']: ", item['major_name_en'])

        degree_name = response.xpath(
            "//a[@class='active-trail'][contains(text(),'Bachelors degrees')]/../ul//a[@class='active-trail']//text()"
        ).extract()
        clear_space(degree_name)
        print("degree_name: ", degree_name)
        degree_text = ''.join(degree_name).strip()
        item['degree_name'] = degree_abbreviations.get(degree_text, degree_text)
        print("item['degree_name']: ", item['degree_name'])

        # Independent checks on purpose: a later match overrides an earlier one.
        if item['degree_name'] == "Diploma":
            item['degree_level'] = 3
        if "Bachelor" in item['degree_name']:
            item['degree_level'] = 1
        if "Post" in item['degree_name']:
            item['degree_level'] = 2
        print("item['degree_level']: ", item['degree_level'])

        # NOTE(review): the original indentation was lost; the yield is
        # assumed to sit inside this guard so that pages without a
        # recognised degree level are skipped - confirm against the spider.
        if item['degree_level'] is not None:
            overview_en = response.xpath(
                "//h2[contains(text(),'Faculty')]/preceding-sibling::*[position()<last()-3]"
            ).extract()
            if len(overview_en) > 0:
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en))
                print("item['overview_en']: ", item['overview_en'])
            # Career information and course modules are not scraped for
            # this school (see item['other']).
            yield item
    except Exception as e:
        # Best-effort error log; make sure the directory exists so that the
        # handler itself cannot fail with FileNotFoundError.
        os.makedirs("scrapySchool_Canada_College/error", exist_ok=True)
        with open("scrapySchool_Canada_College/error/" + item['school_name'] + ".txt",
                  'a', encoding="utf-8") as f:
            f.write(str(e) + "\n" + response.url + "\n========================\n")
        print("异常:", str(e))
        print("报错url:", response.url)
def parse(self, response):
    """Scrape one Niagara College programme page into an item.

    Programme name, code, school, credential, length, intakes, campus,
    overview and careers come from ``response``.  Admission requirements
    and the course tables are fetched with ``requests`` from the
    ``admission-requirements/`` and ``courses/`` sub-pages of the same URL.
    Deadlines, tuition and language scores are hard-coded per degree level.

    ``degree_level`` is 3 for (Advanced) Diploma, 2 for Graduate
    Certificate and 1 for everything else.

    Yields:
        The populated item.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)
    url = response.url
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }

    def labelled_text(label):
        # Text of every sibling element following the <span> holding `label`.
        nodes = response.xpath(
            "//span[contains(text(),'%s')]//following-sibling::*" % label
        ).extract()
        return remove_tags(''.join(nodes))

    def fetch_fragments(page_url, xpaths):
        # Download `page_url` once and return, for each XPath, the matched
        # nodes serialised to HTML (class attributes removed), or None when
        # nothing matched.  The timeout keeps a stalled server from
        # blocking the crawl forever.
        page = requests.get(page_url, headers=headers, timeout=30)
        tree = etree.HTML(page.text)
        fragments = []
        for xpath in xpaths:
            nodes = tree.xpath(xpath)
            if len(nodes) > 0:
                html = ''.join(
                    etree.tostring(node, encoding='unicode',
                                   pretty_print=False, method='html')
                    for node in nodes)
                fragments.append(remove_class(html))
            else:
                fragments.append(None)
        return fragments

    # 1-3. school_name, url, location
    item['school_name'] = 'Niagara College'
    item['url'] = url
    item['location'] = 'Ontario, Canada'

    # 4. major_name_en
    major_name_en = ''.join(response.xpath(
        '//*[@id="page-title"]/div[2]/div[2]/h3/span').extract())
    item['major_name_en'] = remove_tags(major_name_en).replace('& ', '')

    # 5. programme_code
    item['programme_code'] = labelled_text('Code:')

    # 6. department
    department = remove_tags(''.join(response.xpath(
        '//*[@id="page-title"]/div[2]/div[1]/h2/div/a/span').extract()))
    if 'School of ' not in department:
        department = department.replace('School of', 'School of ')
    # NOTE(review): the replace as found was a no-op (' ' -> ' ');
    # presumably it collapsed double spaces - confirm.
    department = department.replace('  ', ' ')
    item['department'] = department

    # 7-8. degree_name, degree_level ('Advanced Diploma' is tested first
    # because it also contains 'Diploma').
    credential = labelled_text('Credential Awarded:')
    if 'Advanced Diploma' in credential:
        degree_name, degree_level = 'Advanced Diploma', 3
    elif 'Diploma' in credential:
        degree_name, degree_level = 'Diploma', 3
    elif 'Graduate Certificate' in credential:
        degree_name, degree_level = 'Graduate Certificate', 2
    else:
        degree_name, degree_level = "Bachelor's Degree", 1
    item['degree_name'] = degree_name
    item['degree_level'] = degree_level

    # 9-10. duration, duration_per (unit code; presumably 1 = years,
    # 3 = months, 2 = terms - TODO confirm against the item schema).
    length_text = labelled_text('Length:')
    if 'Year' in length_text:
        digits = re.findall(r'\d', length_text)
        duration = digits[0] if digits else None
        duration_per = 1
    elif 'Months' in length_text:
        digits = re.findall(r'\d+', length_text)
        duration = digits[0] if digits else None
        duration_per = 3
    elif '3' in length_text:
        duration, duration_per = 3, 2
    elif 'Four' in length_text:
        duration, duration_per = 4, 2
    elif 'Six' in length_text:
        duration, duration_per = 6, 2
    else:
        duration, duration_per = None, None
    item['duration'] = duration
    item['duration_per'] = duration_per

    # 11. start_date: intakes whose status column reads "Open".
    status_text = remove_tags(''.join(
        response.xpath('//*[@id="status-table"]//tr').extract()))
    start_date = ','.join(
        re.findall(r'([A-Za-z]+\s[0-9]+)Open', status_text))
    for intake, iso_month in (('September 2019', '2019-09'),
                              ('January 2020', '2020-01'),
                              ('May 2020', '2020-05'),
                              ('January 2019', '2019-01'),
                              ('May 2019', '2019-05')):
        start_date = start_date.replace(intake, iso_month)
    item['start_date'] = start_date.replace('Available', '').replace('Closed', '')

    # 12. campus
    item['campus'] = labelled_text('Campus:').replace('& ', '')

    # 13. overview_en
    item['overview_en'] = remove_class(''.join(
        response.xpath("//div[@class='program-overview']").extract()))

    # 14. career_en
    item['career_en'] = remove_class(''.join(
        response.xpath("//ul[@class='career-opp-list']").extract()))

    # 15-16. entry_requirements_en, specific_requirement_en
    entry_requirements_en, specific_requirement_en = fetch_fragments(
        url + 'admission-requirements/',
        ['//*[@id="content"]/div[1]', '//*[@id="content"]/div[2]'])
    item['entry_requirements_en'] = entry_requirements_en
    item['specific_requirement_en'] = specific_requirement_en

    # 17. modules_en
    item['modules_en'] = fetch_fragments(url + 'courses/', ['//table'])[0]

    # 18. deadline
    item['deadline'] = '2019-02-01,2019-06-07,2019-11-01'

    # 19-20. tuition_fee_pre, tuition_fee
    item['tuition_fee_pre'] = "$"
    if degree_level == 1:
        item['tuition_fee'] = '15,150'
    elif degree_level == 2:
        item['tuition_fee'] = '13,350'
    else:
        item['tuition_fee'] = '14,150'

    # 21. apply_pre
    item['apply_pre'] = '$'

    # 22-25. IELTS / TOEFL minimums by degree level
    if degree_level == 3:
        ielts, ielts_band = 6.0, 5.5
        toefl_band = 20
    else:
        ielts, ielts_band = 6.5, 6.0
        toefl_band = 22
    item['ielts'] = ielts
    item['ielts_r'] = ielts_band
    item['ielts_w'] = ielts_band
    item['ielts_s'] = ielts_band
    item['ielts_l'] = ielts_band
    item['toefl'] = 79
    item['toefl_s'] = toefl_band
    item['toefl_w'] = toefl_band

    item['other'] = '1.申请费未找到。2.中国学生要求未找到'
    yield item
def parse(self, response):
    """Scrape one Centennial College programme page into an item.

    Labelled facts (code, school, credential, length, start date,
    location) are read from the ``<span>`` label/value pairs on the page;
    overview, modules, careers and entry requirements come from the
    ``tab-1`` .. ``tab-4`` containers.  Deadlines, application fee and the
    TOEFL/IELTS minimums are hard-coded.

    ``degree_level`` is 3 when the credential contains "Diploma", 1 for
    "Degree", 2 for "Graduate Certificate", otherwise ``None``.

    Yields:
        The populated item.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)

    def labelled_text(label):
        # Text of every sibling element following the <span> holding `label`.
        nodes = response.xpath(
            "//span[contains(text(),'%s')]//following-sibling::*" % label
        ).extract()
        return remove_tags(''.join(nodes))

    def tab_html(tab_id):
        # HTML of one tab container with class attributes removed.
        return remove_class(''.join(
            response.xpath("//div[@id='%s']" % tab_id).extract()))

    # 1-3. school_name (spelling corrected from 'Centennoal'), url, location
    item['school_name'] = 'Centennial College'
    item['url'] = response.url
    item['location'] = 'Ontario, Canada'

    # 4. major_name_en
    major_name_en = ''.join(
        response.xpath('//*[@id="programBannerTitle"]').extract())
    item['major_name_en'] = remove_tags(major_name_en).replace('& ', '')

    # 5-7. programme_code, department, degree_name
    item['programme_code'] = labelled_text('Program Code:')
    item['department'] = labelled_text('School:')
    degree_name = labelled_text('Credential:')
    item['degree_name'] = degree_name

    # 8-9. duration, duration_per (unit code; presumably 1 = years and
    # 2 = terms - TODO confirm against the item schema).
    length_text = labelled_text('Program Length:')
    digits = re.findall(r'\d', length_text)
    item['duration'] = digits[0] if digits else None
    item['duration_per'] = 1 if 'year' in length_text else 2

    # 10. start_date
    intake_text = labelled_text('Start Date:')
    if 'Fall, Winter, Summer' in intake_text:
        item['start_date'] = '2019-01,2019-05,2019-09'
    elif 'Fall, Winter' in intake_text:
        item['start_date'] = '2019-01,2019-09'
    else:
        item['start_date'] = '2019-09'

    # 11. campus
    item['campus'] = labelled_text('Location:')

    # 12-14. overview_en, modules_en, career_en
    item['overview_en'] = tab_html('tab-1').replace(
        '<span>Printer Friendly</span>', '')
    item['modules_en'] = tab_html('tab-2')
    item['career_en'] = tab_html('tab-3')

    # 15. degree_level
    if 'Diploma' in degree_name:
        item['degree_level'] = 3
    elif 'Degree' in degree_name:
        item['degree_level'] = 1
    elif 'Graduate Certificate' in degree_name:
        item['degree_level'] = 2
    else:
        item['degree_level'] = None

    # 16. entry_requirements_en
    item['entry_requirements_en'] = tab_html('tab-4').replace(
        '<span>Printer Friendly</span>', '')

    # 17-18. tuition_fee, tuition_fee_pre
    tuition_fee = remove_tags(''.join(response.xpath(
        "//td[contains(text(),'International')]//following-sibling::td[1]"
    ).extract()))
    item['tuition_fee'] = tuition_fee.replace('$', '').replace('.00', '')
    item['tuition_fee_pre'] = '$'

    # 19. deadline
    item['deadline'] = '2019-01-11,2019-05-10,2019-09-06'

    # 20-31. TOEFL / IELTS: advanced diplomas and degrees use the higher
    # tier, every other credential the lower one.
    higher_tier = 'Advanced Diploma' in degree_name or 'Degree' in degree_name
    if higher_tier:
        toefl, toefl_band = 84, 21
        ielts, ielts_band = 6.5, 6.0
    else:
        toefl, toefl_band = 80, 20
        ielts, ielts_band = 6, 5.5

    item['toefl_desc'] = 'Certificate and Diploma Programs *:80+ minimums of 20 for the Internet-based test,Degree and Diploma Programs **:84+ minimums of 21 for the Internet-based test,Other Programs ***:88+ minimums of 22 for the Internet based test'
    item['toefl'] = toefl
    item['toefl_s'] = toefl_band
    item['toefl_w'] = toefl_band
    item['toefl_l'] = toefl_band
    item['toefl_r'] = toefl_band

    item['ielts_desc'] = 'Certificate and Diploma Programs *:6.0 with no band score less than 5.5,Degree and Diploma Programs **:6.5 with no band score less than 6.0'
    item['ielts'] = ielts
    item['ielts_r'] = ielts_band
    item['ielts_w'] = ielts_band
    item['ielts_s'] = ielts_band
    item['ielts_l'] = ielts_band

    # 32-34. other, apply_pre, apply_fee
    item['other'] = '1.degree_name是本科学位的需要修改。2.中国学生要求待确认'
    item['apply_pre'] = '$'
    item['apply_fee'] = 95
    yield item
def parse(self, response):
    """Scrape one Fanshawe College programme page into an item.

    Name, overview, campus, start dates, credential, programme code,
    school, duration, courses, careers and admission requirements are
    read from ``response``.  Application fee and the IELTS/TOEFL minimums
    are hard-coded.  Tuition, deadlines and Chinese-student requirements
    are not scraped (see ``item['other']``).

    Yields:
        The populated item.
    """
    item = get_item(ScrapyschoolCanadaCollegeItem)

    def labelled_text(label):
        # Text of every sibling element following the <strong> holding `label`.
        nodes = response.xpath(
            "//strong[contains(text(),'%s')]//following-sibling::*" % label
        ).extract()
        return remove_tags(''.join(nodes))

    def group_html(group_id):
        # HTML of one programme section with class attributes removed.
        return remove_class(''.join(
            response.xpath('//*[@id="%s"]/div' % group_id).extract()))

    # 1-3. school_name, url, location
    item['school_name'] = 'Fanshawe College'
    item['url'] = response.url
    item['location'] = 'Ontario, Canada'

    # 4. major_name_en
    item['major_name_en'] = remove_tags(''.join(
        response.xpath('//*[@id="page-title"]').extract()))

    # 5. overview_en: every overview paragraph except the last one.
    overview_paragraphs = response.xpath(
        '//*[@id="group_overview"]/div/div/div/div/p').extract()[:-1]
    item['overview_en'] = remove_class(''.join(overview_paragraphs))

    # 6. campus
    campus_text = remove_tags(''.join(response.xpath(
        '//*[@id="group_more_info"]/div/div[1]/div/div/div[3]/em').extract()))
    campuses = clear_space_list(
        re.findall('Campus Code:(.*?)September', campus_text))
    item['campus'] = ','.join(campuses).replace('\xa0', '')

    # 7. start_date: de-duplicate, then sort so the joined value is
    # deterministic (plain set iteration order is arbitrary).
    intake_text = remove_tags(''.join(response.xpath(
        '//*[@id="block-views-program-displays-pr-overview-bl"]/div/div/div[2]'
    ).extract()))
    intakes = clear_space_list(re.findall('(2.*)', intake_text))
    start_date = ','.join(sorted(set(intakes)))
    for month_name, month_number in ((' January', '-01'), (' February', '-02'),
                                     (' March', '-03'), (' April', '-04'),
                                     (' May', '-05'), (' June', '-06'),
                                     (' July', '-07'), (' August', '-08'),
                                     (' September', '-09'), (' October', '-10'),
                                     (' November', '-11'), (' December', '-12')):
        start_date = start_date.replace(month_name, month_number)
    item['start_date'] = start_date

    # 8. degree_name (to be revised: still needs splitting, see `other`)
    item['degree_name'] = labelled_text('Credential')

    # 9. programme_code
    item['programme_code'] = labelled_text('Program Code')

    # 10. department
    item['department'] = labelled_text('Academic School')

    # 11. duration: first number in the text, or None when there is none.
    try:
        item['duration'] = re.findall(r'\d+', labelled_text('Duration Next'))[0]
    except IndexError:
        item['duration'] = None

    # 12. duration_per (unit code; presumably weeks - TODO confirm against
    # the item schema).
    item['duration_per'] = 4

    # 13-15. modules_en, career_en, entry_requirements_en
    item['modules_en'] = group_html('group_courses')
    item['career_en'] = group_html('group_careers')
    item['entry_requirements_en'] = group_html('group_admission')

    # 16-17. apply_fee, apply_pre
    item['apply_fee'] = 100
    item['apply_pre'] = 'CAD$'

    # 18-23. IELTS
    item['ielts_desc'] = 'Overall score of 6.0 with no score less than 5.5 in any of the four bands'
    item['ielts'] = 6.0
    item['ielts_r'] = 5.5
    item['ielts_s'] = 5.5
    item['ielts_l'] = 5.5
    item['ielts_w'] = 5.5

    # 24-25. TOEFL
    item['toefl_desc'] = '550 paper-based test, 79 internet-based test'
    item['toefl'] = 79

    # 26. tuition_fee_pre
    item['tuition_fee_pre'] = 'CAD$'

    # 27. other
    item['other'] = 'deadline,中国学生要求,degree_name需要拆分,学费需要pdf匹配'
    yield item