Example #1
0
    def getHarvardOnlineCourse(self, subject, url, course_dict=None):
        if not self.need_update_subject(subject):
            return
        print "processing " + subject + " url " + url
        file_name = self.get_file_name(subject, self.school)

        file_lines = self.countFileLineNum(file_name)
        count = 0
        f = self.open_db(file_name + ".tmp")

        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        course_num = ""
        title = ""
        link = ""
        description = ''
        print "processing html and write data to file..."
        if len(soup.find_all("li", class_="views-row")) > 0:
            # Current page layout: every course sits in an <li class="views-row">.
            for li in soup.find_all("li", class_="views-row"):
                if li.span is not None:
                    course_num = li.span.span.string
                else:
                    course_num = li.prettify().split("\n")[1].strip()
                course_num = course_num.replace(' E-', '').strip()
                title = li.a.string.strip()
                link = "http://www.extension.harvard.edu" + str(li.a["href"]).strip()
                if self.deep_mind:
                    link, description = self.getMoreInfo(link)
                count += 1
                print course_num + " " + title + ' ' + link
                self.write_db(f, course_num, title, link, description)
                if course_dict is not None:
                    # key CSCI courses under the shorter CS prefix
                    if course_num.startswith('CSCI'):
                        course_num = course_num.replace('CSCI', 'CS')
                    course_dict[course_num] = CourseRecord(self.get_storage_format(course_num, title, link, description))
        else:
            # Fallback for the older layout: scan every <li> whose link points
            # at /courses and that carries an "E-" course number.
            for li in soup.find_all("li"):
                if "class" in li.attrs:
                    if li.prettify().find("E-") != -1 and str(li.a["href"]).startswith("/courses"):
                        for item in li.prettify().split("\n"):
                            if item.find("E-") != -1:
                                course_num = item.replace(' E-', '').strip()
                        count += 1
                        title = li.a.string.strip()
                        link = "http://www.extension.harvard.edu" + str(li.a["href"]).strip()
                        if self.deep_mind:
                            link, description = self.getMoreInfo(link)
                        print course_num + " " + title + ' ' + link
                        self.write_db(f, course_num, title, link, description)
                        if course_dict is not None:
                            if course_num.startswith('CSCI'):
                                course_num = course_num.replace('CSCI', 'CS')
                            course_dict[course_num] = CourseRecord(self.get_storage_format(course_num, title, link, description))
        self.close_db(f)
        if file_lines != count and count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
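The db helpers called above (open_db, write_db, close_db, countFileLineNum, do_upgrade_db, cancel_upgrade) are not part of the excerpt. A plausible reading of the calls is a write-to-temp-then-swap pattern; the sketch below assumes that behaviour, a hypothetical base class name, and a tab-separated record layout, none of which are confirmed by the original code.

import os


class SpiderBase(object):
    # Hypothetical base-class helpers matching how the scrapers above call
    # them; the tmp-file-then-swap behaviour and the tab-separated layout
    # are assumptions.

    def get_storage_format(self, course_num, title, link, description):
        # one record per line; the tab separator is an assumption
        return '\t'.join([course_num, title, link, description])

    def open_db(self, tmp_name):
        # records for one subject are first rewritten into "<db>.tmp"
        return open(tmp_name, 'w')

    def write_db(self, f, course_num, title, link, description):
        f.write(self.get_storage_format(course_num, title, link, description) + '\n')

    def close_db(self, f):
        f.close()

    def countFileLineNum(self, file_name):
        # how many records the existing db file currently holds
        if not os.path.exists(file_name):
            return 0
        with open(file_name) as f:
            return len(f.readlines())

    def do_upgrade_db(self, file_name):
        # the freshly written tmp file replaces the old db
        os.rename(file_name + ".tmp", file_name)

    def cancel_upgrade(self, file_name):
        # nothing changed, so discard the tmp file
        if os.path.exists(file_name + ".tmp"):
            os.remove(file_name + ".tmp")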
Example #2
0
    def getRecordsDict(self):
        # Load every stored course record into a dict keyed by course id.
        records_dict = {}
        f = open(self.get_file_name("eecs/cs", self.school), 'rU')
        for line in f.readlines():
            record = CourseRecord(line)
            records_dict[record.get_id().strip()] = record
        f.close()

        return records_dict
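CourseRecord itself is also outside the excerpt. Assuming the tab-separated line produced by get_storage_format above, a minimal reader compatible with the accessors used in these snippets (get_id, get_title, get_url, get_describe, get_prereq) could look like the following; the field order and the idea that prerequisites live inside the description text are assumptions.

class CourseRecord(object):
    # Minimal sketch: parse one stored line back into named fields.

    def __init__(self, line):
        fields = line.rstrip('\n').split('\t')
        fields += [''] * (4 - len(fields))   # tolerate short lines
        self.course_id, self.title, self.url, self.describe = fields[:4]

    def get_id(self):
        return self.course_id

    def get_title(self):
        return self.title

    def get_url(self):
        return self.url

    def get_describe(self):
        return self.describe

    def get_prereq(self):
        # load_data() below scans this text for course numbers, so the
        # prerequisite info is assumed to be embedded in the description
        return self.describe or None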
Example #3
0
    def processData(self, semester_list, dept, subject):
        if not self.need_update_subject(subject):
            return

        file_name = ''
        if subject == 'eecs':
            file_name = self.get_file_name(subject + "/cmu/" + self.dept_dict[dept], self.school)
        else:
            file_name = self.get_file_name(subject, self.school)

        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        course_dict = {}

        for semester in semester_list:
            print "processing " + self.dept_dict[dept] + " " + semester + " data"
            param = {"SEMESTER": semester,
                     "MINI": "NO",
                     "GRAD_UNDER": "All",
                     "PRG_LOCATION": "All",
                     "DEPT": dept,
                     "LAST_NAME": "",
                     "FIRST_NAME": "",
                     "BEG_TIME": "All",
                     "KEYWORD": "",
                     "TITLE_ONLY": "NO",
                     "SUBMIT": ""}
            r = requests.post("https://enr-apps.as.cmu.edu/open/SOC/SOCServlet/search", params=param)
            soup = BeautifulSoup(r.text)

            for tr in soup.find_all("tr"):
                if tr.td is not None and tr.td.a is not None:
                    html = tr.prettify()
                    # The title lives in the second <td> of the row; strip any
                    # markup still embedded in it.
                    pos = html.find("</td>")
                    title = html[html.find("<td>", pos) + 4 : html.find("</td>", pos + 3)].replace("\n", "").strip()
                    while title.find("<") != -1:
                        title = title[0:title.find("<")].strip() + title[title.find(">") + 1:].replace("\n", "").strip()
                    # The first <td> holds the course number and a link to the
                    # course detail servlet.
                    a_html = tr.td.a.prettify()
                    url = "https://enr-apps.as.cmu.edu" + a_html[a_html.find("/open/SOC/SOCServlet"):a_html.find("'", a_html.find("/open/SOC/SOCServlet"))].replace("amp;", "")
                    course_num = dept + tr.td.a.text
                    if course_dict.get(course_num, '') != '':
                        continue
                    print course_num + " " + title
                    course_dict[course_num] = CourseRecord(self.get_storage_format(course_num, title, url, self.getDescription(url)))

        for k in sorted(course_dict.keys()):
            record = course_dict[k]
            self.count += 1
            self.write_db(f, k, record.get_title().strip(), record.get_url().strip(), record.get_describe().strip())

        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
Example #4
0
    def processStanfordDate(self, f, url, course_dict):
        print 'processing ' + url
        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        th_set = soup.find_all("th")
        td_set_all = soup.find_all("td")
        td_set = []
        td_set_2 = []
        # Skip the first five <th> cells, which are not course entries.
        del th_set[0:5]
        # Each course row has four <td> cells; keep the first (title) and the
        # second (instructors) of every group of four.
        i = 0
        for td in td_set_all:
            i = i + 1
            if i == 1:
                td_set.append(td.string)
            if i == 2:
                td_set_2.append(td.string)
            if i == 4:
                i = 0

        for index in range(len(th_set)):
            link = th_set[index].prettify()
            link = link[link.find("http"):link.find("EDU") + 3]
            if self.isInCourseNumList(th_set[index].string):
                continue
            course_id = th_set[index].string.upper()
            description = ''
            description += 'instructors:' + td_set_2[index] + ' '
            # Attach a video url when one is known for this course title.
            title_key = self.formatCourseTitle(td_set[index])
            if self.course_name_dict.get(title_key, '') != '':
                description += 'videourl:' + self.course_name_dict[title_key] + ' '

            if self.description_dict.get(course_id, '') != '':
                description += 'description:' + self.description_dict[course_id] + ' '
            course_dict[course_id] = CourseRecord(
                self.get_storage_format(course_id, td_set[index], link,
                                        description))
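formatCourseTitle and the course_name_dict / description_dict lookups come from elsewhere in the class. A plausible reading is that schedule titles are normalised before being used as dictionary keys; a sketch under that assumption (not the project's actual implementation):

    def formatCourseTitle(self, title):
        # Hypothetical normalisation: lower-case and collapse whitespace so
        # the schedule title matches the keys used in course_name_dict.
        if title is None:
            return ''
        return ' '.join(title.lower().split())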
Example #5
0
def load_data(course):
    print 'generate graph for ' + course
    result_node_name = ''
    for data_file in data_file_list:
        f = open(data_file, 'rU')
        lines = f.readlines()
        f.close()
        data_file_lines_list.append(lines)
        node_type = data_file[data_file.find('db/') + 3:data_file.find('/', data_file.find('db/') + 3)]

        for line in lines:
            record = CourseRecord(line.strip())
            course_dict[record.get_id().strip()] = (record.get_title(), record.get_url().strip())
            node_name = record.get_id().strip() + " " + record.get_title().strip()
            if record.get_id().strip() == course:
                result_node_name = node_name
            g.add_node(node_name, name=node_name, type=node_type)

    for lines in data_file_lines_list:
        for line in lines:
            record = CourseRecord(line.strip())
            prereq_list = []
            if record.get_prereq() is not None:
                #print record.get_prereq()
                prereq_list = regex.findall(record.get_prereq().lower())

            if len(prereq_list) > 0:
                #prereq = 'prereq:'
                for p in prereq_list:
                    node_name = record.get_id().strip() + ' ' + record.get_title().strip()
                    if course_dict.get(p, '') == '':
                        g.add_node(p, name=p, type='prep')
                        g.add_edge(node_name, p)
                    else:
                        g.add_edge(node_name, p + ' ' + course_dict.get(p)[0].strip())

                    #prereq += p + ' '
                #print prereq
    return result_node_name
Example #6
0
def load_data(course):
    print 'generate graph for ' + course
    result_node_name = ''
    for data_file in data_file_list:
        f = open(data_file, 'rU')
        lines = f.readlines()
        f.close()
        data_file_lines_list.append(lines)
        node_type = data_file[data_file.find('db/') +
                              3:data_file.find('/',
                                               data_file.find('db/') + 3)]

        for line in lines:
            record = CourseRecord(line.strip())
            course_dict[record.get_id().strip()] = (record.get_title(),
                                                    record.get_url().strip())
            node_name = record.get_id().strip() + " " + record.get_title(
            ).strip()
            if record.get_id().strip() == course:
                result_node_name = node_name
            g.add_node(node_name, name=node_name, type=node_type)

    for lines in data_file_lines_list:
        for line in lines:
            record = CourseRecord(line.strip())
            prereq_list = []
            if record.get_prereq() is not None:
                #print record.get_prereq()
                prereq_list = regex.findall(record.get_prereq().lower())

            if len(prereq_list) > 0:
                #prereq = 'prereq:'
                for p in prereq_list:
                    node_name = record.get_id().strip(
                    ) + ' ' + record.get_title().strip()
                    if course_dict.get(p, '') == '':
                        g.add_node(p, name=p, type='prep')
                        g.add_edge(node_name, p)
                    else:
                        g.add_edge(node_name,
                                   p + ' ' + course_dict.get(p)[0].strip())

                    #prereq += p + ' '
                #print prereq
    return result_node_name
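load_data relies on module-level state that the excerpt does not show: g (an undirected graph, judging by the add_node/add_edge calls, most likely networkx), course_dict, data_file_list, data_file_lines_list, and a compiled pattern that pulls course numbers out of the prerequisite text. A possible setup is sketched below; the file paths, the regular expression, and the example course id are illustrative placeholders rather than the project's real values.

import re
import networkx as nx

# Module-level state assumed by load_data(); paths and pattern are guesses.
g = nx.Graph()
course_dict = {}
data_file_lines_list = []
data_file_list = [
    'db/eecs/cs.db',      # node_type is sliced out of the path after 'db/'
    'db/math/math.db',
]
regex = re.compile(r'[a-z]+\s?\d+[a-z]?')   # e.g. "cs 106b", "math51"

# Example call (placeholder course id):
# root_node = load_data('CS106B')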