Exemplo n.º 1
0
class HarvardSpider(Spider):    
    #harvard
    # Scrapes course listings from the Harvard FAS registrar site and merges
    # in video-lecture data gathered by HarvardOnlineSpider.
    # Class-level map of course_num -> title; filled by the <p>-tag pass and
    # consulted by the <strong>-tag pass so a course is not written twice.
    course_dict = {}
 
    def __init__(self):
        # Base spider setup plus the companion spider for online offerings.
        Spider.__init__(self)
        self.school = "harvard"
        self.url = "http://www.registrar.fas.harvard.edu"
        self.harvardOnlineSpider = HarvardOnlineSpider()

    def formatCourseNum(self, subject, oldTitle):
        """Derive a compact course number from a catalog title line.

        oldTitle typically looks like "<subject> <num>. <rest of title>":
        only the part before the first "." is kept, any trailing "(...)"
        is cut off, and the characters "*" and "[" are stripped.  When the
        subject has several words it is abbreviated to the initials of its
        alphabetic words before the number is re-appended (e.g.
        "Computer Science 50" -> "CS50").  If the subject does not occur in
        the title, the cleaned title is returned unchanged.
        """
        title = oldTitle[0 : oldTitle.find(".")]
        if title.find("(") != -1:
            title = title[0 : title.find("(")].strip()
        title = title.replace("*", "").replace("[", "")
        if title.find(subject) != -1:
            course_num = title[len(subject) : ].strip()
            if subject.find(" ") != -1:
                # Multi-word subject: build an acronym from the first letter
                # of each alphabetic word.
                words = re.compile("[A-Za-z]+").findall(subject) 
                pre = ""
                for word in words:
                    pre += word[0 : 1]
                return pre + course_num
            else:
                return subject + course_num
        return title

    def getHarvardCourse(self, subject, url):
        """Download one subject page, parse its courses, and write a db file.

        Writes to "<file_name>.tmp" first; the tmp file is promoted via
        do_upgrade_db only when the parsed course count differs from the
        existing file's line count, otherwise the upgrade is cancelled.
        Courses that also appear in the HarvardOnlineSpider dict are tagged
        "features:Video lectures" and, when available, get the online URL
        as their link.
        """
        # Skip subjects the base Spider says are up to date.
        if self.need_update_subject(subject) == False:
            return
        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        # Deep/degenerate HTML can blow the default recursion limit inside
        # BeautifulSoup, so raise it before parsing-heavy traversal.
        sys.setrecursionlimit(3000)
        file_name = self.get_file_name(subject, self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0 
        online_course_dict = self.harvardOnlineSpider.getCourseDict(subject) 
        print len(online_course_dict)
        print "\n\nprocessing " + subject + " html and write data to file..."
        # Pass 1: each <p> element describes one course.  Its text lines are
        # classified by prefix/length heuristics into course number + title,
        # term, prerequisites, description and instructors.
        for p in soup.find_all("p"):
            if p != None:
                prereq = ''
                instructors = ''
                course_num = ''
                title = ''
                description = ''
                term = ''
                link = ''
                # Course link, when present, lives in <p><strong><a href=...>.
                if p.strong != None and p.strong.a != None:
                    link = p.strong.a['href']
                for line in p.text.split('\n'):
                    # Ignore empty lines and boilerplate (copyright notices,
                    # leading-dot lines, "Catalog Number" headers).
                    if line.strip() != '' and line.startswith('Copyright') == False and line.startswith('.') == False and\
                        line.startswith('Catalog Number') == False:
                        line = line.replace("\n", '')
                        #print line
                        # A line starting (within 5 chars) with the subject is
                        # taken as the "<subject> <num>. <title>" header line.
                        if line.find(subject) != -1 and line.find(subject) < 5:
                            course_num = self.formatCourseNum(subject, line)
                            title = line[line.find(".") + 2:].replace("]", "")
                            continue
                        if line.startswith('Half course'):
                            term = 'term:' + line + ' '
                            continue
                        if line.startswith('Prerequisite'):
                            prereq = 'prereq:' + line.replace("Prerequisite:", '').strip() + ' '
                            continue
                        # Long lines (or explicit notes) that are not just a
                        # "(SUBJECT)" cross-reference are the description.
                        if (len(line.strip()) > 40 and line.find('(' + subject + ')') == -1) or line.startswith('Note:'):
                            description = 'description:' + line + ' '
                            continue
                        # Anything left over is assumed to name the instructors.
                        instructors = 'instructors: ' + line + ' ' 


                # Fold all parsed fields into one description string.
                description = instructors + prereq + term + description
                if course_num == '':
                    continue
                self.course_dict[course_num] = title
                self.count += 1
                # Enrich with online-course data when this course has one.
                if online_course_dict.get(course_num, '') != '':
                    description = 'features:Video lectures ' + description
                    if online_course_dict[course_num].get_url() != '':
                        link = online_course_dict[course_num].get_url()
                print course_num + ' ' + title + ' ' + link
                self.write_db(f, course_num, title, link, description)                      

        # Pass 2: some courses appear only inside <strong> tags; extract the
        # title text (directly, via the nested <a>, or from prettified HTML)
        # and write any course number not already seen in pass 1.
        for node in soup.find_all("strong"):
            text = ""
            link = ""
            description = ''
            if node.string == None:
                if node.a != None and node.a.string != None:
                    text = node.a.string.replace("\n", "")
                    link = node.a["href"]
                else:
                    if node.a != None:
                        link = node.a["href"]
                    # No direct string: fall back to slicing the text out of
                    # the prettified markup between the first '>' and '<'.
                    text = node.prettify()
                    if text.find("href=") > 0 :
                        text = text[text.find(">", 8) + 1 : text.find("<", text.find(">", 8)) - 1]
                    else:
                        text = text[text.find(">", 2) + 1 : text.find("<", 8) - 1]
                    text = text.replace("\n", "").strip()
            else:
                text = node.string.replace("\n", "")

            course_num = self.formatCourseNum(subject, text) 
            # Only emit courses the <p>-tag pass did not already record.
            if self.course_dict.get(course_num, '') == '':
                title = text[text.find(".") + 2:].replace("]", "")
                self.count += 1
                if online_course_dict.get(course_num, '') != '':
                    # Unlike pass 1, the description here comes entirely from
                    # the online-course record.
                    description = 'features:Video lectures ' + online_course_dict[course_num].get_description()
                    if online_course_dict[course_num].get_url() != '':
                        link = online_course_dict[course_num].get_url()
                print course_num + ' ' + title + ' ' + link
                self.write_db(f, course_num, title, link, description)

        if self.count == 0:
            print subject + " can not get the data, check the html and python code"
        self.close_db(f)
        # Promote the tmp file only when the course count actually changed.
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"
        
    def doWork(self): 
        """Entry point: fetch the subject index page and scrape every subject.

        Each <span class="field-content"> on the index holds an <a> whose
        text is the subject name and whose href is the subject page path.
        """
        print "downloading harvard course info"
        r = requests.get("http://www.registrar.fas.harvard.edu/courses-exams/courses-instruction")
        soup = BeautifulSoup(r.text)
    
        for span in soup.find_all("span", class_="field-content"):
            #print span.a.string
            self.getHarvardCourse(span.a.string, self.url + str(span.a["href"]))
Exemplo n.º 2
0
 def __init__(self):
     """Initialize base Spider state for the Harvard scraper.

     Sets the school tag and registrar base URL, and creates the
     companion HarvardOnlineSpider used to enrich courses with
     online/video-lecture data.
     """
     Spider.__init__(self)
     self.url = "http://www.registrar.fas.harvard.edu"
     self.school = "harvard"
     self.harvardOnlineSpider = HarvardOnlineSpider()
Exemplo n.º 3
0
 def __init__(self):
     """Set up the Harvard course spider.

     Initializes the base Spider, records the school identifier and
     registrar base URL, and constructs the HarvardOnlineSpider helper
     that supplies online-course (video lecture) information.
     """
     Spider.__init__(self)
     self.harvardOnlineSpider = HarvardOnlineSpider()
     self.school = "harvard"
     self.url = "http://www.registrar.fas.harvard.edu"
Exemplo n.º 4
0
class HarvardSpider(Spider):
    #harvard
    # Scrapes course listings from the Harvard FAS registrar site and merges
    # in video-lecture data gathered by HarvardOnlineSpider.
    # Class-level map of course_num -> title; filled by the <p>-tag pass and
    # consulted by the <strong>-tag pass so a course is not written twice.
    course_dict = {}

    def __init__(self):
        # Base spider setup plus the companion spider for online offerings.
        Spider.__init__(self)
        self.school = "harvard"
        self.url = "http://www.registrar.fas.harvard.edu"
        self.harvardOnlineSpider = HarvardOnlineSpider()

    def formatCourseNum(self, subject, oldTitle):
        """Derive a compact course number from a catalog title line.

        oldTitle typically looks like "<subject> <num>. <rest of title>":
        only the part before the first "." is kept, any trailing "(...)"
        is cut off, and the characters "*" and "[" are stripped.  When the
        subject has several words it is abbreviated to the initials of its
        alphabetic words before the number is re-appended (e.g.
        "Computer Science 50" -> "CS50").  If the subject does not occur in
        the title, the cleaned title is returned unchanged.
        """
        title = oldTitle[0:oldTitle.find(".")]
        if title.find("(") != -1:
            title = title[0:title.find("(")].strip()
        title = title.replace("*", "").replace("[", "")
        if title.find(subject) != -1:
            course_num = title[len(subject):].strip()
            if subject.find(" ") != -1:
                # Multi-word subject: build an acronym from the first letter
                # of each alphabetic word.
                words = re.compile("[A-Za-z]+").findall(subject)
                pre = ""
                for word in words:
                    pre += word[0:1]
                return pre + course_num
            else:
                return subject + course_num
        return title

    def getHarvardCourse(self, subject, url):
        """Download one subject page, parse its courses, and write a db file.

        Writes to "<file_name>.tmp" first; the tmp file is promoted via
        do_upgrade_db only when the parsed course count differs from the
        existing file's line count, otherwise the upgrade is cancelled.
        Courses that also appear in the HarvardOnlineSpider dict are tagged
        "features:Video lectures" and, when available, get the online URL
        as their link.
        """
        # Skip subjects the base Spider says are up to date.
        if self.need_update_subject(subject) == False:
            return
        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        # Deep/degenerate HTML can blow the default recursion limit inside
        # BeautifulSoup, so raise it before parsing-heavy traversal.
        sys.setrecursionlimit(3000)
        file_name = self.get_file_name(subject, self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        online_course_dict = self.harvardOnlineSpider.getCourseDict(subject)
        print len(online_course_dict)
        print "\n\nprocessing " + subject + " html and write data to file..."
        # Pass 1: each <p> element describes one course.  Its text lines are
        # classified by prefix/length heuristics into course number + title,
        # term, prerequisites, description and instructors.
        for p in soup.find_all("p"):
            if p != None:
                prereq = ''
                instructors = ''
                course_num = ''
                title = ''
                description = ''
                term = ''
                link = ''
                # Course link, when present, lives in <p><strong><a href=...>.
                if p.strong != None and p.strong.a != None:
                    link = p.strong.a['href']
                for line in p.text.split('\n'):
                    # Ignore empty lines and boilerplate (copyright notices,
                    # leading-dot lines, "Catalog Number" headers).
                    if line.strip() != '' and line.startswith('Copyright') == False and line.startswith('.') == False and\
                        line.startswith('Catalog Number') == False:
                        line = line.replace("\n", '')
                        #print line
                        # A line starting (within 5 chars) with the subject is
                        # taken as the "<subject> <num>. <title>" header line.
                        if line.find(subject) != -1 and line.find(subject) < 5:
                            course_num = self.formatCourseNum(subject, line)
                            title = line[line.find(".") + 2:].replace("]", "")
                            continue
                        if line.startswith('Half course'):
                            term = 'term:' + line + ' '
                            continue
                        if line.startswith('Prerequisite'):
                            prereq = 'prereq:' + line.replace(
                                "Prerequisite:", '').strip() + ' '
                            continue
                        # Long lines (or explicit notes) that are not just a
                        # "(SUBJECT)" cross-reference are the description.
                        if (len(line.strip()) > 40
                                and line.find('(' + subject + ')')
                                == -1) or line.startswith('Note:'):
                            description = 'description:' + line + ' '
                            continue
                        # Anything left over is assumed to name the instructors.
                        instructors = 'instructors: ' + line + ' '

                # Fold all parsed fields into one description string.
                description = instructors + prereq + term + description
                if course_num == '':
                    continue
                self.course_dict[course_num] = title
                self.count += 1
                # Enrich with online-course data when this course has one.
                if online_course_dict.get(course_num, '') != '':
                    description = 'features:Video lectures ' + description
                    if online_course_dict[course_num].get_url() != '':
                        link = online_course_dict[course_num].get_url()
                print course_num + ' ' + title + ' ' + link
                self.write_db(f, course_num, title, link, description)

        # Pass 2: some courses appear only inside <strong> tags; extract the
        # title text (directly, via the nested <a>, or from prettified HTML)
        # and write any course number not already seen in pass 1.
        for node in soup.find_all("strong"):
            text = ""
            link = ""
            description = ''
            if node.string == None:
                if node.a != None and node.a.string != None:
                    text = node.a.string.replace("\n", "")
                    link = node.a["href"]
                else:
                    if node.a != None:
                        link = node.a["href"]
                    # No direct string: fall back to slicing the text out of
                    # the prettified markup between the first '>' and '<'.
                    text = node.prettify()
                    if text.find("href=") > 0:
                        text = text[text.find(">", 8) +
                                    1:text.find("<", text.find(">", 8)) - 1]
                    else:
                        text = text[text.find(">", 2) + 1:text.find("<", 8) -
                                    1]
                    text = text.replace("\n", "").strip()
            else:
                text = node.string.replace("\n", "")

            course_num = self.formatCourseNum(subject, text)
            # Only emit courses the <p>-tag pass did not already record.
            if self.course_dict.get(course_num, '') == '':
                title = text[text.find(".") + 2:].replace("]", "")
                self.count += 1
                if online_course_dict.get(course_num, '') != '':
                    # Unlike pass 1, the description here comes entirely from
                    # the online-course record.
                    description = 'features:Video lectures ' + online_course_dict[
                        course_num].get_description()
                    if online_course_dict[course_num].get_url() != '':
                        link = online_course_dict[course_num].get_url()
                print course_num + ' ' + title + ' ' + link
                self.write_db(f, course_num, title, link, description)

        if self.count == 0:
            print subject + " can not get the data, check the html and python code"
        self.close_db(f)
        # Promote the tmp file only when the course count actually changed.
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(
                self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def doWork(self):
        """Entry point: fetch the subject index page and scrape every subject.

        Each <span class="field-content"> on the index holds an <a> whose
        text is the subject name and whose href is the subject page path.
        """
        print "downloading harvard course info"
        r = requests.get(
            "http://www.registrar.fas.harvard.edu/courses-exams/courses-instruction"
        )
        soup = BeautifulSoup(r.text)

        for span in soup.find_all("span", class_="field-content"):
            #print span.a.string
            self.getHarvardCourse(span.a.string,
                                  self.url + str(span.a["href"]))