# These excerpts assume the usual scraper imports at the top of the module:
import requests
from bs4 import BeautifulSoup


def getHarvardOnlineCourse(self, subject, url, course_dict=None):
    if not self.need_update_subject(subject):
        return
    print "processing " + subject + " url " + url
    file_name = self.get_file_name(subject, self.school)
    file_lines = self.countFileLineNum(file_name)
    count = 0
    f = self.open_db(file_name + ".tmp")
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    course_num = ""
    title = ""
    link = ""
    description = ""
    print "processing html and writing data to file..."
    if len(soup.find_all("li", class_="views-row")) > 0:
        # Newer catalog layout: each course sits in an <li class="views-row">.
        for li in soup.find_all("li", class_="views-row"):
            if li.span is not None:
                course_num = li.span.span.string
            else:
                course_num = li.prettify().split("\n")[1].strip()
            course_num = course_num.replace(' E-', '').strip()
            title = li.a.string.strip()
            link = "http://www.extension.harvard.edu" + str(li.a["href"]).strip()
            if self.deep_mind:
                link, description = self.getMoreInfo(link)
            count += 1
            print course_num + " " + title + " " + link
            self.write_db(f, course_num, title, link, description)
            if course_dict is not None:
                if course_num.startswith('CSCI'):
                    course_num = course_num.replace('CSCI', 'CS')
                course_dict[course_num] = CourseRecord(
                    self.get_storage_format(course_num, title, link, description))
    else:
        # Older layout: scan every <li> that carries a class attribute.
        for li in soup.find_all("li"):
            if "class" in li.attrs:
                if li.prettify().find("E-") != -1 and str(li.a["href"]).startswith("/courses"):
                    for item in li.prettify().split("\n"):
                        if item.find("E-") != -1:
                            course_num = item.replace(' E-', '').strip()
                    count += 1
                    title = li.a.string.strip()
                    link = "http://www.extension.harvard.edu" + str(li.a["href"]).strip()
                    if self.deep_mind:
                        link, description = self.getMoreInfo(link)
                    print course_num + " " + title + " " + link
                    self.write_db(f, course_num, title, link, description)
                    if course_dict is not None:
                        # Mirror the branch above and normalize CSCI ids to CS
                        # (the original tested startswith('E-'), which can never
                        # match after the ' E-' strip above).
                        if course_num.startswith('CSCI'):
                            course_num = course_num.replace('CSCI', 'CS')
                        course_dict[course_num] = CourseRecord(
                            self.get_storage_format(course_num, title, link, description))
    self.close_db(f)
    if file_lines != count and count > 0:
        self.do_upgrade_db(file_name)
        print "before lines: " + str(file_lines) + " after update: " + str(count) + "\n\n"
    else:
        self.cancel_upgrade(file_name)
        print "no need to upgrade\n"
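# A minimal standalone sketch of the "views-row" scrape above. The URL and
# the assumption that each course still renders as <li class="views-row">
# are illustrative; only requests and BeautifulSoup are required:
def demo_harvard_listing(url="http://www.extension.harvard.edu/courses"):
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    for li in soup.find_all("li", class_="views-row"):
        if li.a is None or li.a.string is None:
            continue
        title = li.a.string.strip()
        link = "http://www.extension.harvard.edu" + str(li.a["href"]).strip()
        print title + " " + link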
def getRecordsDict(self):
    records_dict = {}
    f = open(self.get_file_name("eecs/cs", self.school), 'rU')
    for line in f.readlines():
        record = CourseRecord(line)
        records_dict[record.get_id().strip()] = record
    f.close()  # the original leaked this handle
    return records_dict
def processData(self, semester_list, dept, subject):
    if not self.need_update_subject(subject):
        return
    if subject == 'eecs':
        file_name = self.get_file_name(subject + "/cmu/" + self.dept_dict[dept], self.school)
    else:
        file_name = self.get_file_name(subject, self.school)
    file_lines = self.countFileLineNum(file_name)
    f = self.open_db(file_name + ".tmp")
    self.count = 0
    course_dict = {}
    for semester in semester_list:
        print "processing " + self.dept_dict[dept] + " " + semester + " data"
        param = {"SEMESTER": semester, "MINI": "NO", "GRAD_UNDER": "All",
                 "PRG_LOCATION": "All", "DEPT": dept, "LAST_NAME": "",
                 "FIRST_NAME": "", "BEG_TIME": "All", "KEYWORD": "",
                 "TITLE_ONLY": "NO", "SUBMIT": ""}
        r = requests.post("https://enr-apps.as.cmu.edu/open/SOC/SOCServlet/search", params=param)
        soup = BeautifulSoup(r.text)
        for tr in soup.find_all("tr"):
            if tr.td is not None and tr.td.a is not None:
                # The title lives in the <td> that follows the course-number cell.
                pos = tr.prettify().find("</td>")
                title = tr.prettify()[tr.prettify().find("<td>", pos) + 4:
                                      tr.prettify().find("</td>", pos + 3)].replace("\n", "").strip()
                # Strip any markup left inside the title text.
                while title.find("<") != -1:
                    title = title[0:title.find("<")].strip() + title[title.find(">") + 1:].replace("\n", "").strip()
                url = "https://enr-apps.as.cmu.edu" + tr.td.a.prettify()[
                    tr.td.a.prettify().find("/open/SOC/SOCServlet"):
                    tr.td.a.prettify().find("'", tr.td.a.prettify().find("/open/SOC/SOCServlet"))].replace("amp;", "")
                course_num = dept + tr.td.a.text
                if course_dict.get(course_num, '') != '':
                    continue  # already collected in an earlier semester
                print course_num + " " + title
                course_dict[course_num] = CourseRecord(
                    self.get_storage_format(course_num, title, url, self.getDescription(url)))
    for k in sorted(course_dict):
        record = course_dict[k]
        self.count += 1
        self.write_db(f, k, record.get_title().strip(), record.get_url().strip(),
                      record.get_describe().strip())
    self.close_db(f)
    if file_lines != self.count and self.count > 0:
        self.do_upgrade_db(file_name)
        print "before lines: " + str(file_lines) + " after update: " + str(self.count) + "\n\n"
    else:
        self.cancel_upgrade(file_name)
        print "no need to upgrade\n"
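# The while-loop above removes leftover tags from a prettified table cell; a
# self-contained sketch of the same trick (note that, as written, it also
# drops the whitespace adjacent to each tag):
def strip_tags(title):
    # Repeatedly cut everything between the next '<' and the next '>'.
    while title.find("<") != -1:
        title = title[0:title.find("<")].strip() + title[title.find(">") + 1:].strip()
    return title

# strip_tags("Intro <i>to</i> CS") returns "IntrotoCS": the tags are gone,
# but so are the spaces around them.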
def processStanfordDate(self, f, url, course_dict):
    print 'processing ' + url
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    th_set = soup.find_all("th")
    td_set_all = soup.find_all("td")
    td_set = []
    td_set_2 = []
    del th_set[0:5]  # skip the header cells
    # Each course row spans four <td> cells: keep the first (title) and the
    # second (instructors), skip the other two.
    i = 0
    for td in td_set_all:
        i += 1
        if i == 1:
            td_set.append(td.string)
        if i == 2:
            td_set_2.append(td.string)
        if i == 4:
            i = 0
    for index in range(len(th_set)):
        link = th_set[index].prettify()
        link = link[link.find("http"):link.find("EDU") + 3]
        if self.isInCourseNumList(th_set[index].string):
            continue
        course_id = th_set[index].string.upper()
        description = 'instructors:' + td_set_2[index] + ' '
        # The original checked this same condition twice; once is enough.
        if self.course_name_dict.get(self.formatCourseTitle(td_set[index]), '') != '':
            description += 'videourl:' + self.course_name_dict[self.formatCourseTitle(td_set[index])] + ' '
        if self.description_dict.get(course_id, '') != '':
            description += 'description:' + self.description_dict[course_id] + ' '
        course_dict[course_id] = CourseRecord(
            self.get_storage_format(course_id, td_set[index], link, description))
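# The i counter above walks the flat <td> list four cells at a time; the same
# grouping can be expressed with slices (a sketch assuming, as the code above
# does, exactly four <td> cells per course row):
def group_cells(td_set_all):
    titles = [td.string for td in td_set_all[0::4]]       # first cell of each row
    instructors = [td.string for td in td_set_all[1::4]]  # second cell of each row
    return titles, instructors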
def load_data(course):
    print 'generate graph for ' + course
    result_node_name = ''
    for data_file in data_file_list:
        f = open(data_file, 'rU')
        lines = f.readlines()
        f.close()
        data_file_lines_list.append(lines)
        # The node type is the db subdirectory name: db/<type>/...
        node_type = data_file[data_file.find('db/') + 3:data_file.find('/', data_file.find('db/') + 3)]
        for line in lines:
            record = CourseRecord(line.strip())
            course_dict[record.get_id().strip()] = (record.get_title(), record.get_url().strip())
            node_name = record.get_id().strip() + " " + record.get_title().strip()
            if record.get_id().strip() == course:
                result_node_name = node_name
            g.add_node(node_name, name=node_name, type=node_type)
    # Second pass: link every course node to its parsed prerequisites.
    for lines in data_file_lines_list:
        for line in lines:
            record = CourseRecord(line.strip())
            prereq_list = []
            if record.get_prereq() is not None:
                prereq_list = regex.findall(record.get_prereq().lower())
            for p in prereq_list:
                node_name = record.get_id().strip() + ' ' + record.get_title().strip()
                if course_dict.get(p, '') == '':
                    # Unknown prerequisite: add it as a bare "prep" node.
                    g.add_node(p, name=p, type='prep')
                    g.add_edge(node_name, p)
                else:
                    g.add_edge(node_name, p + ' ' + course_dict.get(p)[0].strip())
    return result_node_name
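# Usage sketch for load_data. The function relies on module-level state that
# is defined elsewhere in the repo; everything below (paths, the prereq
# pattern, the course id) is illustrative, not the project's actual setup:
import re
import networkx as nx

data_file_list = ["db/mit/eecs/cs"]          # illustrative db file path
data_file_lines_list = []
course_dict = {}
regex = re.compile(r"[a-z]+\s*\d+[.\d]*")    # illustrative prereq pattern
g = nx.Graph()

root = load_data("6.006")                    # illustrative course id
print str(g.number_of_nodes()) + " nodes, " + str(g.number_of_edges()) + " edges; root: " + root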