def parse_item(self, response):
    """Begin extracting a Keele programme page (unfinished).

    Prints a banner with the page URL, creates an empty ``HooliItem`` and
    joins the strings selected by the XPath query with single spaces.
    Nothing is stored on the item and nothing is returned or yielded.
    """
    print('==================================', response.url)
    item = HooliItem()
    url = response.url
    university = "Keele"
    # NOTE(review): the XPath expression is an empty string, which looks like
    # a placeholder that was never filled in -- confirm the intended selector.
    title_parts = response.xpath('').extract()
    programme = ' '.join(title_parts)
def parse_item(self, response):
    """Extract one University of Exeter postgraduate course page.

    Each scraped value is printed with a running number as it is collected
    (debug trace), then every field of the item is populated in a fixed
    order. Fields this page does not provide are stored as the string
    ``'NULL'``.

    Args:
        response: page response exposing ``url`` and ``xpath(...).extract()``.

    Yields:
        A single populated ``HooliItem``.
    """
    # Keys are written to the item in exactly this order.
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type", "start_date",
        "degree_description", "overview", "mode", "duration", "modules",
        "teaching", "assessment", "career", "application_date", "deadline",
        "application_fee", "tuition_fee", "location", "ATAS", "GPA",
        "average_score", "accredited_university", "Alevel", "IB", "IELTS",
        "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL", "TOEFL_L",
        "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT", "MCAT",
        "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )
    record = {}

    def text_of(query, *drop):
        """Join the strings selected by ``query``; delete ``drop`` tokens in order."""
        text = ''.join(response.xpath(query).extract())
        for token in drop:
            text = text.replace(token, '')
        return text

    def keep(number, name, value):
        """Remember ``value`` for field ``name`` and print the debug trace line."""
        record[name] = value
        print(number, value)

    print('==================================', response.url)
    item = HooliItem()

    keep(1, "url", response.url)
    keep(2, "university", "EXETER postgraduate study and research")
    record.update(
        country='UK',
        website='https://www.exeter.ac.uk',
        degree_level='1',
    )

    keep(3, "programme", text_of('//div[@id="left-col"]//h1//text()'))
    keep(4, "degree_type", text_of('//div[@id="left-col"]/h1/text()'))
    keep(5, "duration", text_of(
        '//div[@class="panel-padding"]/table/tbody/tr/td/span[1]/text()'))
    keep(6, "start_date", text_of(
        '//div[@class="panel-padding"]//table//tbody//tr[4]//td//text()',
        '\r\n'))
    keep(7, "location", text_of(
        '//div[@class="panel-padding"]//text()', '\r\n'))
    keep(8, "overview", text_of(
        '//div[@id="Overview"]//text()', '\r\n', '\n'))
    keep(9, "mode", text_of(
        '//td[@class="exeter-course-duration"]/span/text()'))
    keep(10, "modules", text_of(
        '//div[@id="myTabContent"]//text()', '\r\n', '\n'))
    keep(11, "assessment", text_of('//div[@id="Learning"]//text()', '\r\n'))
    keep(12, "career", text_of('//div[@id="Careers"]//text()', '\r\n', '\n'))
    keep(13, "entry_requirements", text_of(
        '//div[@id="Entry-requirements"]//p[1]//text()'))
    keep(14, "TOEFL", text_of(
        '//div[@id="Entry-requirements"]//p[6]//text()'))
    keep(15, "IELTS", text_of(
        '//div[@id="Entry-requirements"]//p[5]//text()'))
    keep(16, "tuition_fee", text_of(
        '//div[@class="highlight-panel-fees"]//ul//li[2]//text()', '\r\n'))
    keep(17, "other", text_of('//div[@class="span9"]//text()', '\r\n', '\n'))
    keep(18, "create_time",
         datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # Every field without a scraped value (including GPA) gets the same
    # 'NULL' sentinel, so no field can carry a misspelled placeholder.
    for name in field_order:
        item[name] = record.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Extract one Goldsmiths postgraduate programme page.

    Each scraped value is printed with a running number as it is collected
    (debug trace), then every field of the item is populated in a fixed
    order. Fields this page does not provide are stored as the string
    ``'NULL'``.

    Args:
        response: page response exposing ``url`` and ``xpath(...).extract()``.

    Yields:
        A single populated ``HooliItem``.
    """
    # Keys are written to the item in exactly this order.
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type", "start_date",
        "degree_description", "overview", "mode", "duration", "modules",
        "teaching", "assessment", "career", "application_date", "deadline",
        "application_fee", "tuition_fee", "location", "ATAS", "GPA",
        "average_score", "accredited_university", "Alevel", "IB", "IELTS",
        "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL", "TOEFL_L",
        "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT", "MCAT",
        "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )
    hero = '//div[@class="hero__content"]'
    hero_list = hero + '/ul[@class="split-list split-list--hero"]'
    rich = '//div[@class="rich-content rich-content-section full-wrap"]'
    record = {}

    def text_of(query, *drop):
        """Join the strings selected by ``query``; delete ``drop`` tokens in order."""
        text = ''.join(response.xpath(query).extract())
        for token in drop:
            text = text.replace(token, '')
        return text

    def keep(number, name, value):
        """Remember ``value`` for field ``name`` and print the debug trace line."""
        record[name] = value
        print(number, value)

    print('==================================', response.url)
    item = HooliItem()

    keep(1, "url", response.url)
    keep(2, "university", 'Goldsmiths UNIVERSITY OF LONDON')

    # Department: at most 50 characters starting at the word "Department".
    header_text = text_of(hero_list + '/li//text()')
    dept_at = header_text.find("Department")
    if dept_at == -1:
        department = "NULL"
    else:
        department = header_text[dept_at:dept_at + 50]
        # Assigned here as well so the key keeps its original position in
        # the item when a department is found.
        item["department"] = department
    keep(3, "department", department)

    record["country"] = 'UK'
    # Store the joined title string; the raw list of text nodes must not end
    # up in the item.
    keep(4, "programme", text_of(hero + '/h1/text()'))
    keep(5, "degree_type", text_of(hero + '/h1/text()'))
    record.update(degree_level='1', website='https://www.gold.ac.uk/pg')

    keep(6, "overview", text_of(rich + '//text()', '\r\n'))
    keep(7, "duration", text_of(hero_list + '/li/text()', '\r\n'))
    keep(8, "modules", text_of(rich + '//text()', '\r\n', '\n'))
    keep(9, "assessment", text_of(rich + '//text()', '\r\n'))
    keep(10, "career", text_of(rich + '//text()', '\r\n'))

    # IELTS: text from "IELTS" up to the following "If you need".
    body_text = text_of(rich + '/p//text()', '\r\n')
    ielts_at = body_text.find("IELTS")
    if ielts_at == -1:
        ielts = 'NULL'
    else:
        # Look for the end marker only after the start. If it is missing,
        # keep the rest of the text instead of slicing with -1.
        ielts_end = body_text.find("If you need", ielts_at)
        if ielts_end == -1:
            ielts = body_text[ielts_at:]
        else:
            ielts = body_text[ielts_at:ielts_end]
        # Early assignment keeps the key's original position in the item.
        item["IELTS"] = ielts
    keep(11, "IELTS", ielts)

    keep(12, "entry_requirements", text_of(rich + '/ul/li/text()', '\n'))
    keep(15, "create_time",
         datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    for name in field_order:
        item[name] = record.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Extract one University of Worcester course page.

    Scraped values are printed with a number as they are collected (debug
    trace), then every field of the item is populated in a fixed order.
    Fields this page does not provide are stored as the string ``'NULL'``.
    The degree type is derived from the page heading by
    ``self.getDegree_type``.

    Args:
        response: page response exposing ``url`` and ``xpath(...).extract()``.

    Yields:
        A single populated ``HooliItem``.
    """
    # Keys are written to the item in exactly this order.
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type", "start_date",
        "degree_description", "overview", "mode", "duration", "modules",
        "teaching", "assessment", "career", "application_date", "deadline",
        "application_fee", "tuition_fee", "location", "ATAS", "GPA",
        "average_score", "accredited_university", "Alevel", "IB", "IELTS",
        "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL", "TOEFL_L",
        "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT", "MCAT",
        "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )
    heading_query = '//section[@class="pageHead"]/h1/text()'
    accordion_query = '//*[@id="#content"]/div/div/dl/dd/div//text()'
    record = {}

    def joined(query):
        """Concatenate the strings selected by ``query``."""
        return ''.join(response.xpath(query).extract())

    print('==================================', response.url)
    item = HooliItem()

    record["url"] = response.url
    print(1, record["url"])
    record["university"] = 'University of Worcester'
    print(2, record["university"])
    record.update(
        country='UK',
        city='worcester',
        website='https://www.worcester.ac.uk',
        degree_level='1',
        location='worcester',
    )

    # An empty heading is recorded as the 'NULL' sentinel.
    heading = joined(heading_query)
    record["programme"] = heading if heading else 'NULL'
    print(3, record["programme"])

    record["degree_type"] = self.getDegree_type(joined(heading_query))
    print(4, record["degree_type"])

    record["overview"] = joined('//div[@class="body-copy"]/ul/li/text()')
    print(5, record["overview"])

    record["modules"] = joined('//dd[@class="accordion__content rte"]//text()')
    print(6, record["modules"])

    # Career, how-to-apply and entry requirements all read the same
    # accordion text.
    record["career"] = joined(accordion_query)
    print(8, record["career"])
    record["how_to_apply"] = joined(accordion_query)
    print(11, record["how_to_apply"])
    record["entry_requirements"] = joined(accordion_query)
    print(12, record["entry_requirements"])

    record["create_time"] = datetime.datetime.now().strftime(
        '%Y-%m-%d %H:%M:%S')
    print(15, record["create_time"])

    for name in field_order:
        item[name] = record.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Extract one Loughborough University course page (degree level '0').

    Only the programme title, the modules markup and one assessment
    paragraph are scraped; every other field is stored as the string
    ``'NULL'``. Scraped values are printed as they are collected (debug
    trace).

    Args:
        response: page response exposing ``url`` and ``xpath(...).extract()``.

    Yields:
        A single populated ``HooliItem``.
    """
    # Keys are written to the item in exactly this order.
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type", "start_date",
        "degree_description", "overview", "mode", "duration", "modules",
        "teaching", "assessment", "career", "application_date", "deadline",
        "application_fee", "tuition_fee", "location", "ATAS", "GPA",
        "average_score", "accredited_university", "Alevel", "IB", "IELTS",
        "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL", "TOEFL_L",
        "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT", "MCAT",
        "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )

    def joined(query, sep):
        """Join the strings selected by ``query`` with ``sep``."""
        return sep.join(response.xpath(query).extract())

    print('==================================', response.url)
    item = HooliItem()

    record = {
        "url": response.url,
        # Spelled out correctly so records group under one university name.
        "university": 'Loughborough University',
        "country": 'UK',
        "degree_level": '0',
    }
    print(record["url"])

    record["programme"] = joined(
        '//div[@class="hero__content"]//h1//text()', ' ')
    print(record["programme"], 1)

    # This query selects the elements themselves (no text() step), so the
    # extracted strings are the serialized elements.
    record["modules"] = joined('//div[@class="toggle_container"]', '')
    print(2, record["modules"])

    record["assessment"] = joined(
        '//div[@class="content-wrapper"]/p[3]/text()', ' ')
    print(record["assessment"], 3)

    record["create_time"] = datetime.datetime.now().strftime(
        '%Y-%m-%d %H:%M:%S')
    print(15, record["create_time"])

    for name in field_order:
        item[name] = record.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Extract one Queen Mary University of London course page.

    Each scraped value is printed with a number as it is collected (debug
    trace), then every field of the item is populated in a fixed order.
    Fields this page does not provide are stored as the string ``'NULL'``.
    The tuition fee is derived from the fee section text by
    ``self.getTuition_fee``.

    Args:
        response: page response exposing ``url`` and ``xpath(...).extract()``.

    Yields:
        A single populated ``HooliItem``.
    """
    # Keys are written to the item in exactly this order.
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type", "start_date",
        "degree_description", "overview", "mode", "duration", "modules",
        "teaching", "assessment", "career", "application_date", "deadline",
        "application_fee", "tuition_fee", "location", "ATAS", "GPA",
        "average_score", "accredited_university", "Alevel", "IB", "IELTS",
        "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL", "TOEFL_L",
        "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT", "MCAT",
        "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )
    # Degree abbreviations are matched as substrings and the first hit wins.
    # Tokens that contain another token ("MSci" > "MSc", "MBA" > "BA") are
    # listed before it, otherwise the shorter one would always win.
    degree_tokens = (
        "BSc", "MSci", "MSc", "MBA", "BA", "MNSW", "PGCert", "MA", "MComp",
        "PhD", "FdA", "PGCE", "IFP", "LLB", "MHealth Res", "MRes", "MMed",
        "MCh", "LLM", "Y2QF", "Y2QG",
    )
    heading_query = '//section[@id="count"]/article/header/h2/text()'
    record = {}

    def text_of(query, *drop):
        """Join the strings selected by ``query``; delete ``drop`` tokens in order."""
        text = ''.join(response.xpath(query).extract())
        for token in drop:
            text = text.replace(token, '')
        return text

    def keep(number, name, value):
        """Remember ``value`` for field ``name`` and print the debug trace line."""
        record[name] = value
        print(number, value)

    print('==================================', response.url)
    item = HooliItem()

    keep(1, "url", response.url)
    keep(2, "university", 'Queen Mary University of London')
    record.update(
        country='UK',
        website='http://search.qmul.ac.uk',
        degree_level='1',
    )

    keep(3, "programme",
         text_of('//section[@id="count"]/article/header//text()'))

    heading = text_of(heading_query)
    degree_type = next(
        (token for token in degree_tokens if token in heading),
        'Ordinary degree',
    )
    keep(4, "degree_type", degree_type)

    keep(6, "overview", text_of('//div[@id="first"]//text()', '\n'))
    keep(7, "mode", text_of(heading_query, '\r\n', ' '))
    keep(8, "duration", text_of(heading_query, '\r\n', ' '))
    keep(9, "modules", text_of('//div[@id="second"]//text()', '\r\n', '\n'))
    keep(10, "assessment", text_of('//div[@id="fourth"]//text()'))

    fee = self.getTuition_fee(
        text_of('//div[@id="fifth"]//text()', '\r\n', ' '))
    try:
        tuition_fee = fee if fee > 0 else 'NULL'
    except TypeError:
        # The helper returned something that cannot be compared with 0.
        tuition_fee = '报错!'
    keep(12, "tuition_fee", tuition_fee)

    keep(15, "entry_requirements",
         text_of('//div[@id="third"]//text()', '\n'))
    keep(15, "create_time",
         datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    for name in field_order:
        item[name] = record.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Extract one Goldsmiths course page (degree level '0').

    Each scraped value is printed with a running number as it is collected
    (debug trace), then every field of the item is populated in a fixed
    order. Fields this page does not provide are stored as the string
    ``'NULL'``.

    Args:
        response: page response exposing ``url`` and ``xpath(...).extract()``.

    Yields:
        A single populated ``HooliItem``.
    """
    # Keys are written to the item in exactly this order.
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type", "start_date",
        "degree_description", "overview", "mode", "duration", "modules",
        "teaching", "assessment", "career", "application_date", "deadline",
        "application_fee", "tuition_fee", "location", "ATAS", "GPA",
        "average_score", "accredited_university", "Alevel", "IB", "IELTS",
        "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL", "TOEFL_L",
        "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT", "MCAT",
        "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )
    hero = '//div[@class="hero__content"]'
    hero_list = '//ul[@class="split-list split-list--hero"]/li/text()'
    rich = '//div[@class="rich-content rich-content-section full-wrap"]'
    record = {}

    def joined(query, sep=''):
        """Join the strings selected by ``query`` with ``sep``."""
        return sep.join(response.xpath(query).extract())

    def between(text, start_marker, end_marker):
        """Return ``text`` from ``start_marker`` up to the next ``end_marker``.

        Returns ``None`` when ``start_marker`` is absent. When no
        ``end_marker`` follows the start, the rest of the text is returned
        rather than slicing with a -1 index.
        """
        start = text.find(start_marker)
        if start == -1:
            return None
        end = text.find(end_marker, start)
        return text[start:] if end == -1 else text[start:end]

    def keep(number, name, value):
        """Remember ``value`` for field ``name`` and print the debug trace line."""
        record[name] = value
        print(number, value)

    item = HooliItem()
    record["url"] = response.url
    print(record["url"])
    print('----------------------------------------------------')
    keep(1, "university", 'Goldsmiths University of London')

    # Department: at most 50 characters starting at the word "Department".
    header_text = joined(
        '//*[@id="maincontent"]/article/header/div/div/div/div/div//text()',
        ' ')
    dept_at = header_text.find("Department")
    if dept_at == -1:
        department = "NULL"
    else:
        department = header_text[dept_at:dept_at + 50]
        # Assigned here as well so the key keeps its original position in
        # the item when a department is found.
        item["department"] = department
    keep(2, "department", department)

    record.update(
        country='UK',
        website='https://www.gold.ac.uk',
        degree_level='0',
    )
    keep(3, "programme", joined(hero + '//h1//text()'))
    keep(4, "ucas_code", joined(hero_list))
    keep(5, "degree_type", joined(hero + '/h1/text()'))
    keep(6, "overview", joined(rich + '/p/text()'))
    keep(7, "duration", joined(hero_list))
    keep(8, "modules", joined(
        '//div[@class="grid-push grid-push--two"]'
        '/div[@class="rich-content rich-content-section full-wrap"]/p/text()',
        ' '))
    keep(9, "assessment", joined(rich + '/p[11]/text()', ' '))

    # Career: accordion text from "Skills & careers" to "Fees & funding".
    career = between(
        joined('//section[@class="section section--accordion"]//text()', ' '),
        "Skills & careers",
        "Fees & funding",
    )
    if career is None:
        career = 'NULL'
    else:
        # Early assignment keeps the key's original position in the item.
        item["career"] = career
    keep(10, "career", career)

    keep(11, "Alevel", joined(hero + '/ul/li/text()', ' '))
    keep(12, "IB", joined(hero + '/ul/li/text()', ' '))

    # IELTS: paragraph text from "IELTS" up to the following "If you need".
    ielts = between(joined(rich + '//p//text()', ' '), "IELTS", "If you need")
    if ielts is None:
        ielts = 'NULL'
    else:
        # Early assignment keeps the key's original position in the item.
        item["IELTS"] = ielts
    keep(13, "IELTS", ielts)

    keep(15, "create_time",
         datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # Every field without a scraped value (including LSAT) gets the same
    # 'NULL' sentinel, so no field can carry a misspelled placeholder.
    for name in field_order:
        item[name] = record.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Scrape one Loughborough University programme page into a ``HooliItem``.

    Every scraped value is the concatenated text of an XPath query on
    ``response``; fields that are not scraped are stored as the string
    ``'NULL'``. Scraped values are also printed with a numeric label.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...).extract()`` are used.

    Yields:
        The populated ``HooliItem``.
    """

    def text_of(query):
        """Join every text node matched by ``query`` into one string."""
        return ''.join(response.xpath(query).extract())

    def between(text, start_marker, end_marker):
        """Return ``text`` from ``start_marker`` up to ``end_marker``.

        Gives ``'NULL'`` when the start marker is absent and runs to the
        end of ``text`` when no end marker follows it, so a failed
        ``str.find`` (-1) is never used as a slice index.
        """
        begin = text.find(start_marker)
        if begin == -1:
            return 'NULL'
        finish = text.find(end_marker, begin + len(start_marker))
        return text[begin:] if finish == -1 else text[begin:finish]

    # Order in which the keys are written to the item.
    field_order = (
        'url', 'university', 'country', 'city', 'website', 'department',
        'programme', 'ucas_code', 'degree_level', 'degree_type',
        'start_date', 'degree_description', 'overview', 'mode', 'duration',
        'modules', 'teaching', 'assessment', 'career', 'application_date',
        'deadline', 'application_fee', 'tuition_fee', 'location', 'ATAS',
        'GPA', 'average_score', 'accredited_university', 'Alevel', 'IB',
        'IELTS', 'IELTS_L', 'IELTS_S', 'IELTS_R', 'IELTS_W', 'TOEFL',
        'TOEFL_L', 'TOEFL_S', 'TOEFL_R', 'TOEFL_W', 'GRE', 'GMAT', 'LSAT',
        'MCAT', 'working_experience', 'interview', 'portfolio',
        'application_documents', 'how_to_apply', 'entry_requirements',
        'chinese_requirements', 'school_test', 'SATI', 'SATII', 'SAT_code',
        'ACT', 'ACT_code', 'other', 'create_time',
    )

    print('==================================', response.url)
    item = HooliItem()

    url = response.url
    print(1, url)
    university = 'Loughborough University'
    print(2, university)

    heading = '//div[@class="programme-column programme-details"]/h1[@id="top"]'
    programme = text_of(heading + '//text()')
    print(3, programme)
    degree_type = text_of(heading + '/span/text()')
    print(4, degree_type)

    start_date = text_of(
        '//div[@class="list__content icon icon--calendar"]/dd/text()')
    print(5, start_date)

    overview = text_of(
        '//div[@class="content-type content-type--main"]'
        '/div[@class="content-type__container"]'
        '/div[@class="editor"]//text()')
    print(6, overview)

    # Mode and duration are both read from the same "clock" entry.
    clock_text = text_of(
        '//div[@class="list__content icon icon--clock"]//text()')
    clock_text = clock_text.replace('\r\n', '').replace(' ', '')
    mode = clock_text
    print(7, mode)
    duration = clock_text
    print(8, duration)

    # Modules, career and entry requirements are all cut out of the same
    # container text, so it is extracted once.
    container_text = text_of(
        '//div[@class="container"]//text()').replace('\r\n', '')

    modules = between(container_text, "Modules", "How you'll be assessed")
    print(9, modules)

    assessment = text_of(
        '//div[@class="content-type__container"]'
        '/div[@class="editor"]/p/span/text()')
    print(10, assessment)

    career = between(
        container_text,
        "Your personal and professional development",
        "Fees and funding")
    print(11, career)

    tuition_fee = text_of(
        '//div[@class="list__content icon icon--money"]//text()')
    tuition_fee = tuition_fee.replace('\r\n', '').replace(' ', '')
    print(12, tuition_fee)

    location = text_of(
        '//dd[@class="list__item list__item--definition"]/a/text()')
    print(13, location)

    # The page must mention entry requirements at all; the captured section
    # then starts at a different heading, which may itself be missing.
    if "Entry requirements" in container_text:
        entry_requirements = between(
            container_text,
            "Who should study this programme?",
            "English Language requirements")
    else:
        entry_requirements = 'NULL'
    print(15, entry_requirements, '==================================')

    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    scraped = {
        'url': url,
        'university': university,
        'country': 'UK',
        'website': 'http://www.lboro.ac.uk',
        'programme': programme,
        'degree_level': '1',
        'degree_type': degree_type,
        'start_date': start_date,
        'overview': overview,
        'mode': mode,
        'duration': duration,
        'modules': modules,
        'assessment': assessment,
        'career': career,
        'tuition_fee': tuition_fee,
        'location': location,
        'entry_requirements': entry_requirements,
        'create_time': create_time,
    }
    # Anything not scraped above is recorded as the 'NULL' placeholder.
    for field in field_order:
        item[field] = scraped.get(field, 'NULL')
    yield item
def parse_item(self, response):
    """Scrape one University of Bath course page into a ``HooliItem``.

    Every scraped value is the concatenated text of an XPath query on
    ``response``; fields that are not scraped are stored as the string
    ``'NULL'``. Scraped values are also printed with a numeric label.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...).extract()`` are used.

    Yields:
        The populated ``HooliItem``.
    """

    def text_of(query):
        """Join every text node matched by ``query`` into one string."""
        return ''.join(response.xpath(query).extract())

    def between(text, start_marker, end_marker=None, limit=None):
        """Return the part of ``text`` that starts at ``start_marker``.

        The section stops before ``end_marker`` when one follows, and is
        cut to ``limit`` characters when a limit is given. ``'NULL'`` is
        returned when the start marker is absent, so a failed
        ``str.find`` (-1) is never used as a slice index.
        """
        begin = text.find(start_marker)
        if begin == -1:
            return 'NULL'
        finish = -1
        if end_marker is not None:
            finish = text.find(end_marker, begin + len(start_marker))
        section = text[begin:] if finish == -1 else text[begin:finish]
        return section if limit is None else section[:limit]

    # Order in which the keys are written to the item.
    field_order = (
        'url', 'university', 'country', 'city', 'website', 'department',
        'programme', 'ucas_code', 'degree_level', 'degree_type',
        'start_date', 'degree_description', 'overview', 'mode', 'duration',
        'modules', 'teaching', 'assessment', 'career', 'application_date',
        'deadline', 'application_fee', 'tuition_fee', 'location', 'ATAS',
        'GPA', 'average_score', 'accredited_university', 'Alevel', 'IB',
        'IELTS', 'IELTS_L', 'IELTS_S', 'IELTS_R', 'IELTS_W', 'TOEFL',
        'TOEFL_L', 'TOEFL_S', 'TOEFL_R', 'TOEFL_W', 'GRE', 'GMAT', 'LSAT',
        'MCAT', 'working_experience', 'interview', 'portfolio',
        'application_documents', 'how_to_apply', 'entry_requirements',
        'chinese_requirements', 'school_test', 'SATI', 'SATII', 'SAT_code',
        'ACT', 'ACT_code', 'other', 'create_time',
    )

    print('==================================', response.url)
    item = HooliItem()

    url = response.url
    print(1, url)
    university = 'BATH UNIVERSITY'
    print(2, university)

    programme = text_of(
        '//h1[@class="page-heading text-center reverse"]/text()')
    print(3, programme)

    ucas_code = text_of(
        '//div[@class="sidebar"]/dl/dd/text()').replace('\n', '')
    print(4, ucas_code)

    degree_type = text_of(
        '//h1[@class="page-heading text-center reverse"]/span/text()')
    degree_type = degree_type.replace('\n', '')
    print(5, degree_type)

    # Start date, mode and duration all come from the page sub-heading.
    subheading = text_of(
        '//h2[@class="page-subheading text-center reverse"]/text()')
    start_date = subheading
    print(6, start_date)

    # Overview and career share the same paragraphs.
    summary = text_of(
        '//div[@class="column medium-15 end"]/p/text()').replace('\n', '')
    overview = summary
    print(7, overview)

    mode = subheading.replace('\n', '')
    print(8, mode)
    duration = subheading.replace('\n', '')
    print(9, duration)

    modules = text_of(
        '//div[@class="row medium-up-2"]//text()').replace('\n', '')
    print(10, modules)

    assessment = text_of(
        '//div[@class="column medium-12"]//text()').replace('\n', '')
    print(11, assessment)

    career = summary
    print(12, career)

    key_information = '//div[@class="aside style-dark-red key-information"]'
    deadline = text_of(key_information + '/ul/li/p/text()')
    print(13, deadline)

    fee_text = text_of(key_information + '/ul/li[3]/p[2]/text()')
    fee = self.getTuition_fee(fee_text.replace('\r\n', ''))
    try:
        tuition_fee = fee if fee > 0 else 'NULL'
    except TypeError:
        # The helper returned something that cannot be compared to a number.
        tuition_fee = 'NULL'
    print(14, tuition_fee)

    location = text_of('//div[@class="sidebar"]/dl/dd/a/text()')
    print(15, location)

    # IELTS and TOEFL are cut out of the same language-requirements sidebar.
    language_text = text_of(
        '//div[@class="sidebar seperator reverse"]//text()')
    if "English language requirements" in language_text:
        IELTS = between(
            language_text,
            "IELTS:",
            "The Pearson Test of English Academic (PTE Academic):")
        TOEFL = between(language_text, "TOEFL IBT:", limit=100)
    else:
        IELTS = 'NULL'
        TOEFL = 'NULL'
    print(17, IELTS)
    print(18, TOEFL)

    entry_requirements = text_of(
        '//div[@class="column medium-15 end"]/p//text()')
    print(19, entry_requirements)

    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    scraped = {
        'url': url,
        'university': university,
        'country': 'UK',
        'website': 'http://www.bath.ac.uk',
        'programme': programme,
        'ucas_code': ucas_code,
        'degree_level': '1',
        'degree_type': degree_type,
        'start_date': start_date,
        'overview': overview,
        'mode': mode,
        'duration': duration,
        'modules': modules,
        'assessment': assessment,
        'career': career,
        'deadline': deadline,
        'tuition_fee': tuition_fee,
        'location': location,
        'IELTS': IELTS,
        'TOEFL': TOEFL,
        'entry_requirements': entry_requirements,
        'create_time': create_time,
    }
    # Anything not scraped above is recorded as the 'NULL' placeholder.
    for field in field_order:
        item[field] = scraped.get(field, 'NULL')
    # Without this the callback produced no output at all.
    yield item
def parse_item(self, response):
    """Scrape one Swansea University course page into a ``HooliItem``.

    Every scraped value is the concatenated text of an XPath query on
    ``response``; fields that are not scraped are stored as the string
    ``'NULL'``. Scraped values are also printed with a numeric label.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...).extract()`` are used.

    Yields:
        The populated ``HooliItem``.
    """

    def text_of(query):
        """Join every text node matched by ``query`` into one string."""
        return ''.join(response.xpath(query).extract())

    # Order in which the keys are written to the item.
    field_order = (
        'url', 'university', 'country', 'city', 'website', 'department',
        'programme', 'ucas_code', 'degree_level', 'degree_type',
        'start_date', 'degree_description', 'overview', 'mode', 'duration',
        'modules', 'teaching', 'assessment', 'career', 'application_date',
        'deadline', 'application_fee', 'tuition_fee', 'location', 'ATAS',
        'GPA', 'average_score', 'accredited_university', 'Alevel', 'IB',
        'IELTS', 'IELTS_L', 'IELTS_S', 'IELTS_R', 'IELTS_W', 'TOEFL',
        'TOEFL_L', 'TOEFL_S', 'TOEFL_R', 'TOEFL_W', 'GRE', 'GMAT', 'LSAT',
        'MCAT', 'working_experience', 'interview', 'portfolio',
        'application_documents', 'how_to_apply', 'entry_requirements',
        'chinese_requirements', 'school_test', 'SATI', 'SATII', 'SAT_code',
        'ACT', 'ACT_code', 'other', 'create_time',
    )

    print('==================================', response.url)
    item = HooliItem()

    url = response.url
    print(1, url)
    university = 'Swansea University Prifysgol Abertawe'
    print(2, university)

    title = text_of('//div[@id="contentHeader"]/h1/text()')
    programme = title
    print(3, programme)
    degree_type = self.getDegree_type(title)
    print(4, degree_type)

    overview = text_of(
        '//div[@id="description-contents"]/p/text()').replace('\n', '')
    print(6, overview)

    # The fees block is the source for both the study mode and the fee.
    fees_text = text_of('//div[@id="tuition-fees-contents"]//text()')
    fees_text = fees_text.replace('\r\n', '').replace('\n', '')
    fees_text = fees_text.replace(' ', '')

    # Any page that does not mention "Full-time" is treated as part-time.
    mode = "Full-time" if "Full-time" in fees_text else "Part-time"
    print(7, mode)

    duration = text_of('//div[@id="content-items"]/div/div/ol/li//text()')
    duration = duration.replace('\r\n', '').replace('\n', '')
    duration = duration.replace(' ', '')
    print(8, duration)

    modules = text_of(
        '//div[@id="modules"]/div[@id="modules-contents"]//text()')
    modules = modules.replace('\r\n', '')
    print(9, modules)

    assessment = text_of('//*[@id="teaching-assessment"]//text()')
    print(10, assessment)

    fee = self.getTuition_fee(fees_text)
    try:
        tuition_fee = fee if fee > 0 else 'NULL'
    except TypeError:
        # The helper returned something that cannot be compared to a number.
        tuition_fee = '报错!'
    print(11, tuition_fee)

    requirements_raw = text_of(
        '//div[@id="entry-requirements-contents"]//text()')

    # IELTS is the first 100 characters starting at the first "IELTS".
    ielts_source = requirements_raw.replace('\r\n', '')
    ielts_at = ielts_source.find("IELTS")
    if ielts_at == -1:
        IELTS = 'NULL'
    else:
        IELTS = ielts_source[ielts_at:ielts_at + 100]
    print(12, IELTS)

    how_to_apply = text_of(
        '//div[@id="how-to-apply"]//text()').replace('\n', '')
    print(13, how_to_apply)

    entry_requirements = requirements_raw.replace('\n', '')
    print(14, entry_requirements)

    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    scraped = {
        'url': url,
        'university': university,
        'country': 'UK',
        'programme': programme,
        'degree_level': '1',
        'degree_type': degree_type,
        'overview': overview,
        'mode': mode,
        'duration': duration,
        'modules': modules,
        'assessment': assessment,
        'tuition_fee': tuition_fee,
        'IELTS': IELTS,
        'how_to_apply': how_to_apply,
        'entry_requirements': entry_requirements,
        'create_time': create_time,
    }
    # Anything not scraped above is recorded as the 'NULL' placeholder.
    for field in field_order:
        item[field] = scraped.get(field, 'NULL')
    yield item
def parse_item(self, response):
    """Print the URL of the response being handled and create an empty item.

    The item is created but is neither filled in nor returned.
    """
    separator = '=================================='
    print(separator, response.url)
    item = HooliItem()
def parse_item(self, response):
    """Scrape one University of Bath course page into a ``HooliItem``.

    Every scraped value is the concatenated text of an XPath query on
    ``response``; fields that are not scraped are stored as the string
    ``'NULL'``. Scraped values are also printed with a numeric label.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...).extract()`` are used.

    Yields:
        The populated ``HooliItem``.
    """

    def text_of(query):
        """Join every text node matched by ``query`` into one string."""
        return ''.join(response.xpath(query).extract())

    def between(text, start_marker, end_marker=None, limit=None):
        """Return the part of ``text`` that starts at ``start_marker``.

        The section stops before ``end_marker`` when one follows, and is
        cut to ``limit`` characters when a limit is given. ``'NULL'`` is
        returned when the start marker is absent, so a failed
        ``str.find`` (-1) is never used as a slice index.
        """
        begin = text.find(start_marker)
        if begin == -1:
            return 'NULL'
        finish = -1
        if end_marker is not None:
            finish = text.find(end_marker, begin + len(start_marker))
        section = text[begin:] if finish == -1 else text[begin:finish]
        return section if limit is None else section[:limit]

    # Order in which the keys are written to the item.
    field_order = (
        'url', 'university', 'country', 'city', 'website', 'department',
        'programme', 'ucas_code', 'degree_level', 'degree_type',
        'start_date', 'degree_description', 'overview', 'mode', 'duration',
        'modules', 'teaching', 'assessment', 'career', 'application_date',
        'deadline', 'application_fee', 'tuition_fee', 'location', 'ATAS',
        'GPA', 'average_score', 'accredited_university', 'Alevel', 'IB',
        'IELTS', 'IELTS_L', 'IELTS_S', 'IELTS_R', 'IELTS_W', 'TOEFL',
        'TOEFL_L', 'TOEFL_S', 'TOEFL_R', 'TOEFL_W', 'GRE', 'GMAT', 'LSAT',
        'MCAT', 'working_experience', 'interview', 'portfolio',
        'application_documents', 'how_to_apply', 'entry_requirements',
        'chinese_requirements', 'school_test', 'SATI', 'SATII', 'SAT_code',
        'ACT', 'ACT_code', 'other', 'create_time',
    )

    print('==================================', response.url)
    item = HooliItem()

    url = response.url
    print(1, url)
    university = 'BATH UNIVERSITY'
    print(2, university)

    programme = text_of(
        '//h1[@class="page-title"]/text()'
        '|//*[@id="content"]/div/div/h2//text()')
    print(3, programme)

    ucas_code = text_of(
        '//div[@class="sidebar"]/dl/dd/text()').replace('\n', '')
    print(4, ucas_code)

    degree_type = text_of(
        '//h1[@class="page-title"]/text()').replace('\n', '')
    print(5, degree_type)

    # Start date, mode, duration, deadline and fees are all sections of the
    # same sidebar text, so it is extracted once.
    # NOTE(review): every ' ' is removed before markers that themselves
    # contain spaces are searched for - confirm the intended literal.
    aside_text = text_of(
        '//div[@class="columns small-12 medium-4 content-aside"]//text()')
    aside_text = aside_text.replace('\n', '').replace(' ', '')

    start_date = between(aside_text, 'Course start date', 'Fees')
    print(6, start_date)

    overview = text_of(
        '//div[@class="columns small-12 medium-7 content-area"]/p/text()')
    overview = overview.replace('\n', '')
    print(7, overview)

    mode = between(aside_text, "Mode of attendance", "Events")
    print(8, mode)

    duration = between(aside_text, "Length of course", "Mode of attendance")
    print(9, duration)

    modules = text_of(
        '//div[@class="columns small-12 medium-7"]//text()')
    modules = modules.replace('\n', '')
    print(10, modules)

    # Assessment and career are both the full text of the content area.
    content_text = text_of(
        '//div[@class="columns small-12 medium-7 content-area"]//text()')
    content_text = content_text.replace('\n', '')
    assessment = content_text
    print(11, assessment)
    career = content_text
    print(12, career)

    deadline = between(
        aside_text,
        "Application deadline",
        "Please note applications may close earlier")
    print(13, deadline)

    tuition_fee = between(aside_text, "Fees", "Entry requirements")
    print(14, tuition_fee)

    location = text_of(
        '//div[@class="sidebar"]/dl/dd/a/text()').replace('\n', '')
    print(15, location)

    average_score = text_of('//div[@class="column medium-15 end"]/p/text()')
    print(16, average_score)

    language_text = text_of(
        '//*[@id="content"]/section/div/div/section/div/div/p/text()')
    # The gate and the start marker differ, so "IELTS" may still be absent.
    if "English language" in language_text:
        IELTS = between(language_text, "IELTS", limit=150)
    else:
        IELTS = 'NULL'
    print(17, IELTS)

    TOEFL = text_of(
        '//div[@class="sidebar seperator reverse"]/ul/li/text()')
    print(18, TOEFL)

    entry_requirements = text_of(
        '//section[@class="entry-requirement"]//text()').replace('\n', '')
    print(19, entry_requirements)

    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    scraped = {
        'url': url,
        'university': university,
        'country': 'UK',
        'website': 'http://www.bath.ac.uk',
        'programme': programme,
        'ucas_code': ucas_code,
        'degree_level': '1',
        'degree_type': degree_type,
        'start_date': start_date,
        'overview': overview,
        'mode': mode,
        'duration': duration,
        'modules': modules,
        'assessment': assessment,
        'career': career,
        'deadline': deadline,
        'tuition_fee': tuition_fee,
        'location': location,
        'average_score': average_score,
        'IELTS': IELTS,
        'TOEFL': TOEFL,
        'entry_requirements': entry_requirements,
        # NOTE(review): empty string rather than 'NULL' - kept as found.
        'chinese_requirements': '',
        'create_time': create_time,
    }
    # Anything not scraped above is recorded as the 'NULL' placeholder.
    for field in field_order:
        item[field] = scraped.get(field, 'NULL')
    yield item
def parse_item(self, response):
    """Scrape one Edge Hill University course page into a ``HooliItem``.

    Every scraped value is the concatenated text of an XPath query on
    ``response``; fields that are not scraped are stored as the string
    ``'NULL'``. Scraped values are also printed with a numeric label.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...).extract()`` are used.

    Yields:
        The populated ``HooliItem``.
    """

    def text_of(query):
        """Join every text node matched by ``query`` into one string."""
        return ''.join(response.xpath(query).extract())

    def between(text, start_marker, end_marker):
        """Return ``text`` from ``start_marker`` up to ``end_marker``.

        Gives ``'NULL'`` when the start marker is absent and runs to the
        end of ``text`` when no end marker follows it, so a failed
        ``str.find`` (-1) is never used as a slice index.
        """
        begin = text.find(start_marker)
        if begin == -1:
            return 'NULL'
        finish = text.find(end_marker, begin + len(start_marker))
        return text[begin:] if finish == -1 else text[begin:finish]

    # Order in which the keys are written to the item.
    field_order = (
        'url', 'university', 'country', 'city', 'website', 'department',
        'programme', 'ucas_code', 'degree_level', 'degree_type',
        'start_date', 'degree_description', 'overview', 'mode', 'duration',
        'modules', 'teaching', 'assessment', 'career', 'application_date',
        'deadline', 'application_fee', 'tuition_fee', 'location', 'ATAS',
        'GPA', 'average_score', 'accredited_university', 'Alevel', 'IB',
        'IELTS', 'IELTS_L', 'IELTS_S', 'IELTS_R', 'IELTS_W', 'TOEFL',
        'TOEFL_L', 'TOEFL_S', 'TOEFL_R', 'TOEFL_W', 'GRE', 'GMAT', 'LSAT',
        'MCAT', 'working_experience', 'interview', 'portfolio',
        'application_documents', 'how_to_apply', 'entry_requirements',
        'chinese_requirements', 'school_test', 'SATI', 'SATII', 'SAT_code',
        'ACT', 'ACT_code', 'other', 'create_time',
    )

    # (substring looked for in the title, stored degree type), tried in
    # order. "MBA" must be tried before "BA": every title containing "MBA"
    # also contains "BA", which made the MBA case unreachable before.
    # NOTE(review): 'Bsc' casing is kept as found - confirm it is intended.
    degree_labels = (
        ('BSc', 'Bsc'),
        ('MBA', 'MBA'),
        ('BA', 'BA'),
        ('MNSW', 'MNSW'),
        ('PGCert', 'PGCert'),
        ('MA', 'MA'),
        ('MComp', 'MComp'),
        ('PhD', 'PhD'),
        ('FdA', 'FdA'),
        ('PGCE', 'PGCE'),
        ('IFP', 'IFP'),
        ('LLB', 'LLB'),
        ('MHealth Res', 'MHealth Res'),
        ('MRes', 'MRes'),
        ('MMed', 'MMed'),
        ('MSci', 'MSci'),
        ('MCh', 'MCh'),
    )

    print('==================================', response.url)
    item = HooliItem()

    url = response.url
    print(1, url)
    university = 'Edge Hill University'
    print(2, university)

    overview_rows = '//*[@id="overview"]//tr//text()'
    # Only the 7th and 8th text nodes of the overview table are kept.
    department = ' '.join(response.xpath(overview_rows).extract()[6:8])
    print(3, department)

    title = text_of('//*[@id="primary"]/header/h1//text()')
    programme = title
    print(4, programme)

    # Code, start dates and course length are sections of the overview text.
    overview_text = text_of('//*[@id="overview"]//text()')

    if " Code:" in overview_text:
        ucas_code = between(overview_text, "Code:", "Course Length:")
    else:
        ucas_code = 'NULL'
    print(5, ucas_code)

    degree_type = 'Ordinary degree'
    for token, label in degree_labels:
        if token in title:
            degree_type = label
            break
    print(5, degree_type)

    start_date = between(overview_text, "Start Dates:", "Department:")
    print(6, start_date)

    overview = overview_text
    print(7, overview)

    # Mode and location both use the overview table text.
    rows_text = text_of(overview_rows)
    # Any page that does not mention "Full-Time" is treated as part-time.
    mode = "Full-Time" if "Full-Time" in rows_text else "Part-Time"
    print(8, mode)

    duration = between(overview_text, "Length:", "Start Dates:")
    print(9, duration)

    modules = text_of('//*[@id="modules"]//text()')
    print(10, modules)

    assessment = text_of('//*[@id="course-in-depth"]//text()')
    print(11, assessment)

    career = text_of('//*[@id="careers-and-employability"]//text()')
    print(12, career)

    # The fee is a fixed value, not read from the page.
    tuition_fee = '£12,750'
    print(13, tuition_fee)

    location = rows_text
    print(14, location)

    how_to_apply = text_of('//*[@id="next-steps"]//text()')
    print(15, how_to_apply)

    entry_requirements = text_of('//*[@id="entry-criteria"]//text()')
    print(16, entry_requirements)

    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    scraped = {
        'url': url,
        'university': university,
        'country': 'UK',
        'website': 'https://www.edgehill.ac.uk',
        'department': department,
        'programme': programme,
        'ucas_code': ucas_code,
        'degree_level': '1',
        'degree_type': degree_type,
        'start_date': start_date,
        'overview': overview,
        'mode': mode,
        'duration': duration,
        'modules': modules,
        'assessment': assessment,
        'career': career,
        'tuition_fee': tuition_fee,
        'location': location,
        'how_to_apply': how_to_apply,
        'entry_requirements': entry_requirements,
        'create_time': create_time,
    }
    # Anything not scraped above is recorded as the 'NULL' placeholder.
    for field in field_order:
        item[field] = scraped.get(field, 'NULL')
    yield item
def parse_item(self, response):
    """Scrape one University of Worcester course page into a ``HooliItem``.

    Text is pulled from the page with XPath queries. The "Modules",
    "Assessment", "How to Apply", "Entry requirements" and "IELTS" passages
    are cut out of the joined text by marker. Every field that is not
    scraped is stored as the literal string ``'NULL'``.

    Args:
        response: Response for the course page; only ``response.url`` and
            ``response.xpath(...).extract()`` are used.

    Yields:
        A single populated ``HooliItem``.
    """

    def joined_text(xpath):
        # All text nodes matched by *xpath*, concatenated with no separator.
        return ''.join(response.xpath(xpath).extract())

    def section(text, start_marker, end_marker=None, default=None):
        # Slice *text* from start_marker up to (not including) end_marker.
        # Missing start marker -> *default*, or the whole text when default
        # is None.  Missing end marker -> run to the end of the text, rather
        # than letting find()'s -1 chop off the final character.
        begin = text.find(start_marker)
        if begin == -1:
            return text if default is None else default
        stop = text.find(end_marker, begin) if end_marker else -1
        return text[begin:] if stop == -1 else text[begin:stop]

    print('==================================', response.url)
    item = HooliItem()
    url = response.url
    print(1, url)
    university = 'University of Worcester'
    print(2, university)

    programme = joined_text('//section[@class="pageHead"]//text()')
    print(3, programme)

    # The heading text is handed to getDegree_type (defined elsewhere on the
    # spider) and its return value is stored as the degree type.
    degree_type = self.getDegree_type(
        joined_text('//section[@class="pageHead"]/h1/text()'))
    print(4, degree_type)

    overview = joined_text('//div[@class="body-copy"]/p//text()')
    print(5, overview)

    # Modules and assessment live in the same column text; query it once.
    columns_text = joined_text('//div[@class="columns__column"]//text()')
    modules = section(columns_text, "Modules", "Assessment")
    print(6, modules)
    assessment = section(columns_text, "Assessment")
    print(7, assessment)

    # Career, how-to-apply and entry requirements all come from the
    # accordion text; query it once and slice it three ways.
    accordion_text = joined_text(
        '//dd[@class="accordion__content rte"]//text()')
    career = accordion_text
    print(8, career)

    fee = self.getTuition_fee(joined_text('//div[@class="columns"]//text()'))
    try:
        tuition_fee = fee if fee > 0 else "NULL"
    except TypeError:
        # getTuition_fee returned something that cannot be compared with an
        # int; keep the existing error sentinel ("报错!" means "error!").
        tuition_fee = "报错!"
    print(9, tuition_fee)

    # Take up to 100 characters starting AT the first "IELTS" mention (the
    # offset used to be computed and then ignored).
    ielts_text = joined_text('//div[@class="right equal-height"]//text()')
    IELTS = section(ielts_text, "IELTS", default="NULL")[:100]
    print(10, IELTS)

    how_to_apply = section(
        accordion_text, "How to Apply", "Entry requirements")
    print(11, how_to_apply)

    entry_requirements = section(
        accordion_text, "Entry requirements", "Study options")
    print(12, entry_requirements)

    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    scraped = {
        "url": url,
        "university": university,
        "country": 'UK',
        "degree_level": '1',
        "programme": programme,
        "degree_type": degree_type,
        "overview": overview,
        "modules": modules,
        "assessment": assessment,
        "career": career,
        "tuition_fee": tuition_fee,
        "location": 'worcester',
        "IELTS": IELTS,
        "how_to_apply": how_to_apply,
        "entry_requirements": entry_requirements,
        "create_time": create_time,
    }
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type",
        "start_date", "degree_description", "overview", "mode", "duration",
        "modules", "teaching", "assessment", "career", "application_date",
        "deadline", "application_fee", "tuition_fee", "location", "ATAS",
        "GPA", "average_score", "accredited_university", "Alevel", "IB",
        "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL",
        "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT",
        "MCAT", "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )
    # Fields that were not scraped are stored as the string 'NULL'.
    for name in field_order:
        item[name] = scraped.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Scrape one Norwich University of the Arts page into a ``HooliItem``.

    NOTE(review): every XPath in this callback is still the empty template
    placeholder ``''``. An empty expression is presumably rejected by the
    selector, so fill in the real selectors before running this spider.

    Fields that are not scraped are stored as the literal string ``'NULL'``.

    Args:
        response: Response for the course page; only ``response.url`` and
            ``response.xpath(...).extract()`` are used.

    Yields:
        A single populated ``HooliItem``.
    """

    def joined_text(xpath):
        # All text nodes matched by *xpath*, concatenated with no separator.
        return ''.join(response.xpath(xpath).extract())

    def squeeze(text):
        # Drop CRLF pairs, bare newlines and spaces, in that order.
        return text.replace('\r\n', '').replace('\n', '').replace(' ', '')

    print('==================================', response.url)
    item = HooliItem()
    url = response.url
    print(1, url)
    university = 'NORWICH UNIVERSITY OF THE ARTS'
    print(2, university)

    programme = joined_text('')  # TODO: selector not filled in yet
    print(3, programme)

    degree_type = joined_text('')  # TODO: selector not filled in yet
    print(4, degree_type)

    overview = joined_text('').replace('\n', '')  # TODO: selector
    print(6, overview)

    mode = squeeze(joined_text(''))  # TODO: selector
    print(7, mode)

    duration = squeeze(joined_text(''))  # TODO: selector
    print(8, duration)

    modules = joined_text('').replace('\r\n', '')  # TODO: selector
    print(9, modules)

    assessment = joined_text('')  # TODO: selector
    print(10, assessment)

    tuition_fee = squeeze(joined_text(''))  # TODO: selector
    print(11, tuition_fee)

    how_to_apply = joined_text('').replace('\n', '')  # TODO: selector
    print(13, how_to_apply)

    entry_requirements = joined_text('').replace('\n', '')  # TODO: selector
    print(14, entry_requirements)

    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    scraped = {
        "url": url,
        "university": university,
        "country": 'UK',
        "degree_level": '1',
        "programme": programme,
        "degree_type": degree_type,
        "overview": overview,
        "mode": mode,
        "duration": duration,
        "modules": modules,
        "assessment": assessment,
        "tuition_fee": tuition_fee,
        "how_to_apply": how_to_apply,
        "entry_requirements": entry_requirements,
        "create_time": create_time,
    }
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type",
        "start_date", "degree_description", "overview", "mode", "duration",
        "modules", "teaching", "assessment", "career", "application_date",
        "deadline", "application_fee", "tuition_fee", "location", "ATAS",
        "GPA", "average_score", "accredited_university", "Alevel", "IB",
        "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL",
        "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT",
        "MCAT", "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )
    # Unscraped fields -- IELTS included -- all receive the same 'NULL'
    # sentinel, so downstream consumers only have one "missing" spelling to
    # recognise.
    for name in field_order:
        item[name] = scraped.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Scrape one UWTSD (Trinity St David) course page into a ``HooliItem``.

    Several fields (UCAS code, study mode, duration, location) are all
    derived from the same ``div.span3`` paragraph text, which is extracted
    once and reused. Fields that are not scraped are stored as the literal
    string ``'NULL'``.

    Args:
        response: Response for the course page; only ``response.url`` and
            ``response.xpath(...).extract()`` are used.

    Yields:
        A single populated ``HooliItem``.
    """

    def joined_text(xpath):
        # All text nodes matched by *xpath*, concatenated with no separator.
        return ''.join(response.xpath(xpath).extract())

    print('==================================', response.url)
    item = HooliItem()
    url = response.url
    print(1, url)
    university = 'University of Wales, Trinity St David'
    print(2, university)

    department = joined_text(
        '/html/body/div/div/div/div/div/div/p/a/strong//text()')
    print(3, department)

    title = joined_text('//h1[@class="t4-course-title"]/text()')
    programme = title
    print(4, programme)

    # Text nodes of the side-panel paragraphs; sliced in different ways for
    # the UCAS code, duration and location below.
    side_panel_nodes = response.xpath(
        '//div[@class="span3"]/p//text()').extract()
    side_panel_text = ''.join(side_panel_nodes)

    ucas_code = ''.join(side_panel_nodes[:7])
    print(5, ucas_code)

    # The degree type is taken from the same heading as the programme name.
    degree_type = title
    print(6, degree_type)

    overview = joined_text('//div[@class="span6"]//text()')
    print(8, overview)

    # "Full Time" stays the default (it was previously the only value ever
    # produced); "Part Time" is reported only when the panel mentions it and
    # does not mention "Full Time".
    if "Part Time" in side_panel_text and "Full Time" not in side_panel_text:
        mode = "Part Time"
    else:
        mode = "Full Time"
    print(9, mode)

    duration = ''.join(side_panel_nodes[1:30])
    print(10, duration)

    modules = joined_text('//*[@id="collapseModules"]//text()')
    print(11, modules)

    assessment = joined_text('//*[@id="collapseAssessment"]//text()')
    print(12, assessment)

    career = joined_text('//*[@id="collapseCareerOpportunities"]//text()')
    print(13, career)

    location = side_panel_text
    print(14, location)

    entry_requirements = joined_text(
        '//*[@id="collapseEntryCriteria"]//text()')
    print(15, entry_requirements)

    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    scraped = {
        "url": url,
        "university": university,
        "country": 'UK',
        "website": 'http://www.uwtsd.ac.uk',
        "department": department,
        "programme": programme,
        "ucas_code": ucas_code,
        "degree_level": '0',
        "degree_type": degree_type,
        "overview": overview,
        "mode": mode,
        "duration": duration,
        "modules": modules,
        "assessment": assessment,
        "career": career,
        "location": location,
        "entry_requirements": entry_requirements,
        "create_time": create_time,
    }
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type",
        "start_date", "degree_description", "overview", "mode", "duration",
        "modules", "teaching", "assessment", "career", "application_date",
        "deadline", "application_fee", "tuition_fee", "location", "ATAS",
        "GPA", "average_score", "accredited_university", "Alevel", "IB",
        "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL",
        "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT",
        "MCAT", "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )
    # Fields that were not scraped are stored as the string 'NULL'.
    for name in field_order:
        item[name] = scraped.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Scrape one University of Chester course page into a ``HooliItem``.

    Text is pulled from the page with XPath queries and stripped of CRLF
    pairs. The international fee text is handed to ``self.getTuition_fee``
    and the IELTS requirement is cut out of the international entry
    requirements text. Fields that are not scraped are stored as the literal
    string ``'NULL'``.

    Args:
        response: Response for the course page; only ``response.url`` and
            ``response.xpath(...).extract()`` are used.

    Yields:
        A single populated ``HooliItem``.
    """

    def joined_text(xpath):
        # All text nodes matched by *xpath*, concatenated with no separator.
        return ''.join(response.xpath(xpath).extract())

    print('==================================', response.url)
    item = HooliItem()
    url = response.url
    print(1, url)
    university = 'University of Chester'
    print(2, university)

    programme = joined_text('//h1[@id="main-content"]//text()')
    print(3, programme)

    degree_type = joined_text('//h1[@id="main-content"]/div/text()')
    print(4, degree_type)

    start_date = joined_text(
        '//div[@class="startdate m-facts__item"]//text()')
    print(5, start_date)

    overview = joined_text(
        '//div[@class="courseajax_overview"]//text()').replace('\r\n', '')
    print(6, overview)

    mode = joined_text(
        '//div[@class="mode m-facts__item"]//text()').replace('\r\n', '')
    print(7, mode)

    duration = joined_text(
        '//div[@class="courseajax_duration m-facts__item"]//text()'
    ).replace('\r\n', '')
    print(8, duration)

    modules = joined_text('//*[@id="learning"]//text()')
    modules = modules.replace('\r\n', '').replace('\n', '').replace('\t', '')
    print(9, modules)

    assessment = joined_text(
        '//div[@class="large-7 columns float-right '
        'm-sections__learning-section"]//text()'
    ).replace('\r\n', '')
    print(10, assessment)

    fee_text = joined_text(
        '//div[@class="field-fees-international"]/p/text()').replace('\n', '')
    fee = self.getTuition_fee(fee_text)
    try:
        tuition_fee = fee if fee > 0 else 'NULL'
    except TypeError:
        # getTuition_fee returned something that cannot be compared with an
        # int; treat it as "no fee found".
        tuition_fee = 'NULL'
    print(11, tuition_fee)

    location = joined_text('//div[@id="edit-compulsory"]//text()')
    print(12, location)

    # IELTS: only look when the page has the English-language heading, and
    # only slice when the "IELTS Academic:" marker itself is present -- a
    # missing marker makes find() return -1, which would otherwise be used
    # as a slice index.  A missing end marker runs to the end of the text.
    ielts_text = joined_text(
        '//div[@class="courseajax_entryrequirementsint"]//text()'
    ).replace('\r\n', '')
    IELTS = 'NULL'
    if "English Language Requirements" in ielts_text:
        begin = ielts_text.find("IELTS Academic:")
        if begin != -1:
            stop = ielts_text.find("Select your country", begin)
            if stop == -1:
                excerpt = ielts_text[begin:]
            else:
                excerpt = ielts_text[begin:stop]
            IELTS = excerpt[:120]
    print(13, IELTS)

    entry_requirements = joined_text(
        '//div[@class="courseajax_entryrequirements"]//text()'
    ).replace('\r\n', '')
    print(14, entry_requirements)

    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    scraped = {
        "url": url,
        "university": university,
        "country": 'UK',
        "website": 'https://www1.chester.ac.uk',
        "programme": programme,
        "degree_level": '1',
        "degree_type": degree_type,
        "start_date": start_date,
        "overview": overview,
        "mode": mode,
        "duration": duration,
        "modules": modules,
        "assessment": assessment,
        "tuition_fee": tuition_fee,
        "location": location,
        "IELTS": IELTS,
        "entry_requirements": entry_requirements,
        "create_time": create_time,
    }
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type",
        "start_date", "degree_description", "overview", "mode", "duration",
        "modules", "teaching", "assessment", "career", "application_date",
        "deadline", "application_fee", "tuition_fee", "location", "ATAS",
        "GPA", "average_score", "accredited_university", "Alevel", "IB",
        "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL",
        "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT",
        "MCAT", "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )
    # Fields that were not scraped are stored as the string 'NULL'.
    for name in field_order:
        item[name] = scraped.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Scrape one Loughborough University course page into a ``HooliItem``.

    The start date and application deadline are both cut out of the
    calendar list entry, and the location out of the programme definition
    list, by searching for their text labels. Fields that are not scraped
    are stored as the literal string ``'NULL'``.

    Args:
        response: Response for the course page; only ``response.url`` and
            ``response.xpath(...).extract()`` are used.

    Yields:
        A single populated ``HooliItem``.
    """

    def joined_text(xpath):
        # All text nodes matched by *xpath*, concatenated with no separator.
        return ''.join(response.xpath(xpath).extract())

    def section(text, start_marker, end_marker=None):
        # Slice *text* from start_marker up to (not including) end_marker.
        # Missing start marker -> "NULL".  Missing end marker -> run to the
        # end of the text, rather than letting find()'s -1 chop off the
        # final character.
        begin = text.find(start_marker)
        if begin == -1:
            return "NULL"
        stop = text.find(end_marker, begin) if end_marker else -1
        return text[begin:] if stop == -1 else text[begin:stop]

    print('==================================', response.url)
    item = HooliItem()
    url = response.url
    print(1, url)
    university = 'Loughborough University'
    print(2, university)

    department = joined_text('//dd[@class="list__item--definition"]/text()')
    print(3, department)

    programme = joined_text('//h1[@id="top"]//text()')
    print(4, programme)

    degree_type = joined_text('//h1[@id="top"]/span/text()')
    print(5, degree_type)

    # Start date and deadline share the calendar entry; query it once.
    calendar_text = joined_text(
        '//div[@class="list__content icon icon--calendar"]//text()')
    start_date = section(calendar_text, "Start date:", "Application deadline:")
    print(6, start_date)

    overview = joined_text(
        '//div[@class="content-type content-type--main"]//text()')
    print(7, overview)

    # Mode and duration are both the full text of the clock entry.
    clock_text = joined_text(
        '//div[@class="list__content icon icon--clock"]//text()')
    mode = clock_text
    print(8, mode)
    duration = clock_text
    print(9, duration)

    deadline = section(calendar_text, "Application deadline:")
    print(10, deadline)

    tuition_fee = joined_text(
        '//div[@class="list__content icon icon--money"]//text()')
    print(11, tuition_fee)

    location_text = joined_text(
        '//dl[@class="list list--definition list--pg-programme"]//text()')
    location_text = location_text.replace('\r\n', '').replace(' ', '')
    # NOTE(review): spaces are stripped just above, so the end marker (which
    # contains a space) may never match here; in that case the location now
    # runs to the end of the text -- confirm the intended end marker.
    location = section(location_text, "Location:", "Application deadline:")
    print(12, location)

    # How-to-apply and entry requirements are both the full editor text.
    editor_text = joined_text('//div[@class="editor"]//text()')
    how_to_apply = editor_text
    print(13, how_to_apply)
    entry_requirements = editor_text
    print(14, entry_requirements)

    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    scraped = {
        "url": url,
        "university": university,
        "country": 'UK',
        "website": 'http://www.lboro.ac.uk',
        "department": department,
        "programme": programme,
        "degree_level": '1',
        "degree_type": degree_type,
        "start_date": start_date,
        "overview": overview,
        "mode": mode,
        "duration": duration,
        "deadline": deadline,
        "tuition_fee": tuition_fee,
        "location": location,
        "how_to_apply": how_to_apply,
        "entry_requirements": entry_requirements,
        "create_time": create_time,
    }
    field_order = (
        "url", "university", "country", "city", "website", "department",
        "programme", "ucas_code", "degree_level", "degree_type",
        "start_date", "degree_description", "overview", "mode", "duration",
        "modules", "teaching", "assessment", "career", "application_date",
        "deadline", "application_fee", "tuition_fee", "location", "ATAS",
        "GPA", "average_score", "accredited_university", "Alevel", "IB",
        "IELTS", "IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W", "TOEFL",
        "TOEFL_L", "TOEFL_S", "TOEFL_R", "TOEFL_W", "GRE", "GMAT", "LSAT",
        "MCAT", "working_experience", "interview", "portfolio",
        "application_documents", "how_to_apply", "entry_requirements",
        "chinese_requirements", "school_test", "SATI", "SATII", "SAT_code",
        "ACT", "ACT_code", "other", "create_time",
    )
    # Fields that were not scraped are stored as the string 'NULL'.
    for name in field_order:
        item[name] = scraped.get(name, 'NULL')
    yield item
def parse_item(self, response):
    """Extract one Royal Holloway course page into a ``HooliItem``.

    Each field is either read from the page with an XPath query or set to
    the placeholder string ``'NULL'``. Extracted values are echoed to stdout
    with a numeric label, then the populated item is yielded.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...).extract()`` are used here.

    Yields:
        One ``HooliItem`` with every field assigned.
    """
    null = 'NULL'

    # (token searched for in the heading, label stored on the item), in
    # priority order. A token that contains another token must come first,
    # otherwise the shorter one wins: "MBA" contains "BA" and "MSci"
    # contains "MSc". The 'Bsc' spelling is the value this scraper has
    # always stored for BSc and is kept unchanged.
    degree_labels = (
        ('BSc', 'Bsc'),
        ('MSci', 'MSci'),
        ('MSc', 'MSc'),
        ('MBA', 'MBA'),
        ('BA', 'BA'),
        ('MNSW', 'MNSW'),
        ('PGCert', 'PGCert'),
        ('MA', 'MA'),
        ('MComp', 'MComp'),
        ('PhD', 'PhD'),
        ('FdA', 'FdA'),
        ('PGCE', 'PGCE'),
        ('IFP', 'IFP'),
        ('LLB', 'LLB'),
        ('MHealth Res', 'MHealth Res'),
        ('MRes', 'MRes'),
        ('MMed', 'MMed'),
        ('MCh', 'MCh'),
        ('LLM', 'LLM'),
        ('Y2QF', 'Y2QF'),
        ('Y2QG', 'Y2QG'),
    )

    def text_at(query):
        # Concatenate every string the XPath query extracts.
        return ''.join(response.xpath(query).extract())

    print('==================================', response.url)
    item = HooliItem()
    url = response.url
    print(1, url)
    university = 'Royal Holloway University of LondonEgham'
    print(2, university)
    department = text_at(
        '//div[@id="details"]/table/tbody/tr[4]/td/a/div/text()')
    print(3, department)

    # The page heading supplies both the programme name and the degree type.
    programme = text_at('//div[@class="sys_large-col"]/h1/text()')
    print(4, programme)
    degree_type = next(
        (label for token, label in degree_labels if token in programme),
        'Ordinary degree',
    )
    print(5, degree_type)

    start_date = text_at(
        '//div[@id="details"]/table/tbody/tr[2]/td/div/text()')
    print(6, start_date)
    overview = text_at('//div[@id="tab-1"]//text()').replace('\n', '')
    print(7, overview)

    # Mode and duration are both read from the same table cell.
    mode = text_at('//div[@id="details"]/table/tbody/tr[3]/td/div/text()')
    print(8, mode)
    duration = mode
    print(9, duration)

    assessment = text_at('//div[@id="tab-3"]/p/text()')
    print(10, assessment)
    career = text_at('//div[@id="tab-5"]//text()').replace('\n', '')
    print(11, career)

    # getTuition_fee is defined elsewhere on the spider; presumably it
    # returns a number parsed from the text -- TODO confirm. A result that
    # cannot be compared with 0 is recorded with the error marker string.
    fee_amount = self.getTuition_fee(text_at('//div[@id="tab-6"]/p//text()'))
    try:
        tuition_fee = fee_amount if fee_amount > 0 else null
    except TypeError:
        tuition_fee = '报错!'
    print(12, tuition_fee)

    location = text_at(
        '//div[@id="details"]/table/tbody/tr[5]/td/a/div/text()')
    print(13, location)

    # Keep at most 100 characters starting at the first "IELTS" mention.
    ielts_source = text_at('//div[@id="tab-4"]/p/text()')
    ielts_at = ielts_source.find("IELTS")
    IELTS = null if ielts_at == -1 else ielts_source[ielts_at:ielts_at + 100]
    print(14, IELTS)

    entry_requirements = text_at(
        '//div[@id="tab-4"]//text()').replace('\n', '')
    print(15, entry_requirements)
    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    # Assigned in one fixed order so the item's key order does not depend
    # on what was found on the page.
    fields = (
        ("url", url),
        ("university", university),
        ("country", 'UK'),
        ("city", null),
        ("website", 'https://www.royalholloway.ac.uk'),
        ("department", department),
        ("programme", programme),
        ("ucas_code", null),
        ("degree_level", '1'),
        ("degree_type", degree_type),
        ("start_date", start_date),
        ("degree_description", null),
        ("overview", overview),
        ("mode", mode),
        ("duration", duration),
        ("modules", null),
        ("teaching", null),
        ("assessment", assessment),
        ("career", career),
        ("application_date", null),
        ("deadline", null),
        ("application_fee", null),
        ("tuition_fee", tuition_fee),
        ("location", location),
        ("ATAS", null),
        ("GPA", null),
        ("average_score", null),
        ("accredited_university", null),
        ("Alevel", null),
        ("IB", null),
        ("IELTS", IELTS),
        ("IELTS_L", null),
        ("IELTS_S", null),
        ("IELTS_R", null),
        ("IELTS_W", null),
        ("TOEFL", null),
        ("TOEFL_L", null),
        ("TOEFL_S", null),
        ("TOEFL_R", null),
        ("TOEFL_W", null),
        ("GRE", null),
        ("GMAT", null),
        ("LSAT", null),
        ("MCAT", null),
        ("working_experience", null),
        ("interview", null),
        ("portfolio", null),
        ("application_documents", null),
        ("how_to_apply", null),
        ("entry_requirements", entry_requirements),
        ("chinese_requirements", null),
        ("school_test", null),
        ("SATI", null),
        ("SATII", null),
        ("SAT_code", null),
        ("ACT", null),
        ("ACT_code", null),
        ("other", null),
        ("create_time", create_time),
    )
    for key, value in fields:
        item[key] = value
    yield item
def parse_item(self, response):
    """Extract one Exeter undergraduate course page into a ``HooliItem``.

    Fields are read from the page with XPath queries or set to the
    placeholder string ``'NULL'``. Extracted values are echoed to stdout
    with a numeric label, then the populated item is yielded.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...).extract()`` are used here.

    Yields:
        One ``HooliItem`` with every field assigned.
    """
    placeholder = 'NULL'

    def joined(query):
        # Concatenate every string the XPath query extracts.
        return ''.join(response.xpath(query).extract())

    def without(text, *tokens):
        # Delete each token from the text, one after another, in order.
        for token in tokens:
            text = text.replace(token, '')
        return text

    print('==================================', response.url)
    item = HooliItem()
    page_url = response.url
    print(1, page_url)
    provider = "EXETER UNDERGRADUATE STUDY"
    print(2, provider)

    title = joined('//div[@id="left-col"]/h1//text()')
    print(3, title)
    ucas = joined('//td[@class="exeter-course-ucascode"]//text()')
    print(4, ucas)
    award = joined('//div[@id="left-col"]/h1/text()')
    print(5, award)
    summary = joined('//div[@id="Overview"]//text()')
    print(6, summary)
    length = joined('//td[@class="exeter-course-duration"]//text()')
    print(7, length)

    # The A-level and IB fields are both filled from the typical-offer cell.
    a_level_offer = joined('//td[@class="exeter-course-typicaloffer"]//text()')
    print(8, a_level_offer)
    ib_offer = joined('//td[@class="exeter-course-typicaloffer"]//text()')
    print(9, ib_offer)

    campus = joined('//td[@class="exeter-course-location"]//text()')
    print(10, campus)
    module_text = str(
        without(joined('//div[@class="container"]//text()'), '\n', '\r', '\t'))
    print(11, module_text)
    learning = without(
        joined('//div[@id="Learning"]//text()'), '\r\n', '\n', '\r')
    print(12, learning)
    careers = without(joined('//div[@id="Careers"]//text()'), '\r\n', '\n')
    print(13, careers)
    requirements = without(
        joined('//div[@id="Entry-requirements"]//text()'), '\r\n')
    print(14, requirements)
    synopsis = without(
        joined('//div[@id="course-synopsis"]//text()'), '\r\n', '\n', '\t')
    print(15, synopsis)
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(16, stamp)

    # Written to the item in exactly this order.
    assignments = (
        ("url", page_url),
        ("university", provider),
        ("country", 'UK'),
        ("city", placeholder),
        ("website", 'https://www.exeter.ac.uk'),
        ("department", placeholder),
        ("programme", title),
        ("ucas_code", ucas),
        ("degree_level", '0'),
        ("degree_type", award),
        ("start_date", placeholder),
        ("degree_description", placeholder),
        ("overview", summary),
        ("mode", placeholder),
        ("duration", length),
        ("modules", module_text),
        ("teaching", placeholder),
        ("assessment", learning),
        ("career", careers),
        ("application_date", placeholder),
        ("deadline", placeholder),
        ("application_fee", placeholder),
        ("tuition_fee", placeholder),
        ("location", campus),
        ("ATAS", placeholder),
        ("GPA", placeholder),
        ("average_score", placeholder),
        ("accredited_university", placeholder),
        ("Alevel", a_level_offer),
        ("IB", ib_offer),
        ("IELTS", placeholder),
        ("IELTS_L", placeholder),
        ("IELTS_S", placeholder),
        ("IELTS_R", placeholder),
        ("IELTS_W", placeholder),
        ("TOEFL", placeholder),
        ("TOEFL_L", placeholder),
        ("TOEFL_S", placeholder),
        ("TOEFL_R", placeholder),
        ("TOEFL_W", placeholder),
        ("GRE", placeholder),
        ("GMAT", placeholder),
        ("LSAT", placeholder),
        ("MCAT", placeholder),
        ("working_experience", placeholder),
        ("interview", placeholder),
        ("portfolio", placeholder),
        ("application_documents", placeholder),
        ("how_to_apply", placeholder),
        ("entry_requirements", requirements),
        ("chinese_requirements", placeholder),
        ("school_test", placeholder),
        ("SATI", placeholder),
        ("SATII", placeholder),
        ("SAT_code", placeholder),
        ("ACT", placeholder),
        ("ACT_code", placeholder),
        ("other", synopsis),
        ("create_time", stamp),
    )
    for field_name, field_value in assignments:
        item[field_name] = field_value
    yield item
def parse_item(self, response):
    """Extract one Falmouth University course page into a ``HooliItem``.

    Fields are read from the page with XPath queries, cut out of the
    accordion text between two marker headings, or set to the placeholder
    string ``'NULL'``. Extracted values are echoed to stdout with a numeric
    label, then the populated item is yielded.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath(...).extract()`` are used here.

    Yields:
        One ``HooliItem`` with every field assigned.
    """
    null = 'NULL'

    def text_at(query):
        # Concatenate every string the XPath query extracts.
        return ''.join(response.xpath(query).extract())

    def section(text, start_marker, end_marker):
        # Return the text from start_marker up to the next end_marker, or
        # None when start_marker is absent. str.find returns -1 on a miss,
        # which must not be used as a slice bound (it would cut the final
        # character), so a missing end marker means "up to the end".
        start = text.find(start_marker)
        if start == -1:
            return None
        end = text.find(end_marker, start + len(start_marker))
        return text[start:] if end == -1 else text[start:end]

    print('==================================', response.url)
    item = HooliItem()
    url = response.url
    print(1, url)
    university = 'FALMOUTH UNIVERSITY'
    print(2, university)

    # The page heading supplies both the programme name and the degree type.
    programme = text_at('//div[@class="h1-box"]/h1/text()')
    print(3, programme)
    degree_type = programme
    print(4, degree_type)

    # Several fields are slices of the same accordion text; extract it once.
    accordion_text = text_at('//div[@class="accordion"]//text()')

    dates_section = section(
        accordion_text,
        "Start dates and application deadlines",
        "News and Events",
    )
    start_date = null if dates_section is None else dates_section
    print(5, start_date)

    content_text = text_at('//div[@class="content-block-wrapper"]//text()')
    overview = section(content_text, "Benefits", "How the course is taught")
    if overview is None:
        # No "Benefits" heading: fall back to the whole content block.
        overview = content_text
    print(6, overview)

    # Mode and duration are both read from the same definition list.
    mode = text_at('//div[@class="content-block-wrapper"]//dl//text()')
    print(7, mode)
    duration = mode
    print(8, duration)

    modules = text_at(
        '//div[@class="accordion ui-accordion ui-widget ui-helper-reset"]'
        '//text()')
    print(9, modules)
    assessment = accordion_text
    print(10, assessment)
    career = text_at('//div[@class="field-career-opportunities"]//text()')
    print(11, career)

    # The deadline is taken from the same section as the start date.
    deadline = start_date
    print(11, deadline)

    IELTS = section(
        accordion_text, "Entry Requirements", "Financing your studies")
    if IELTS is None:
        IELTS = null
    print(12, IELTS)

    # Interview and portfolio are both read from the selection-process block.
    interview = text_at('//div[@class="field-selection-process"]//text()')
    print(13, interview)
    portfolio = interview
    print(14, portfolio)

    how_to_apply = section(
        accordion_text,
        "How to apply",
        "Start dates and application deadlines",
    )
    if how_to_apply is None:
        how_to_apply = null
    print(13, how_to_apply)

    entry_requirements = text_at(
        '//*[@id="start-of-content"]/div[2]/div[2]/div[1]//text()')
    print(14, entry_requirements)
    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(15, create_time)

    # Assigned in one fixed order so the item's key order does not depend
    # on which sections were found on the page.
    fields = (
        ("url", url),
        ("university", university),
        ("country", 'UK'),
        ("city", null),
        ("website", 'https://www.falmouth.ac.uk'),
        ("department", null),
        ("programme", programme),
        ("ucas_code", null),
        ("degree_level", '1'),
        ("degree_type", degree_type),
        ("start_date", start_date),
        ("degree_description", null),
        ("overview", overview),
        ("mode", mode),
        ("duration", duration),
        ("modules", modules),
        ("teaching", null),
        ("assessment", assessment),
        ("career", career),
        ("application_date", null),
        ("deadline", deadline),
        ("application_fee", null),
        ("tuition_fee", null),
        ("location", null),
        ("ATAS", null),
        ("GPA", null),
        ("average_score", null),
        ("accredited_university", null),
        ("Alevel", null),
        ("IB", null),
        ("IELTS", IELTS),
        ("IELTS_L", null),
        ("IELTS_S", null),
        ("IELTS_R", null),
        ("IELTS_W", null),
        ("TOEFL", null),
        ("TOEFL_L", null),
        ("TOEFL_S", null),
        ("TOEFL_R", null),
        ("TOEFL_W", null),
        ("GRE", null),
        ("GMAT", null),
        ("LSAT", null),
        ("MCAT", null),
        ("working_experience", null),
        ("interview", interview),
        ("portfolio", portfolio),
        ("application_documents", null),
        ("how_to_apply", how_to_apply),
        ("entry_requirements", entry_requirements),
        ("chinese_requirements", null),
        ("school_test", null),
        ("SATI", null),
        ("SATII", null),
        ("SAT_code", null),
        ("ACT", null),
        ("ACT_code", null),
        ("other", null),
        ("create_time", create_time),
    )
    for key, value in fields:
        item[key] = value
    yield item