Example #1
    def __init__(self):
        Spider.__init__(self)
        self.school = "mit"
        self.ocw_spider = MitOcwSpider()
        self.deep_mind = True
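This constructor fragment assumes a Spider base class and a MitOcwSpider helper defined elsewhere in the same project. A minimal sketch of the context it relies on (stub bodies are hypothetical; only the names come from the snippets):

import re
import requests
from bs4 import BeautifulSoup


class Spider(object):  # hypothetical stub of the project's base class
    def __init__(self):
        self.count = 0


class MitOcwSpider(object):  # hypothetical stub; the real helper scrapes ocw.mit.edu
    def getOcwLinks(self, subject):
        return {}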
Example #2
class MitSpider(Spider):
    ocw_links = {}
    course_num_regex = re.compile(r'[0-9]+\.[0-9]+[a-z]*')
 
    def __init__(self):
        Spider.__init__(self)
        self.school = "mit"
        self.ocw_spider = MitOcwSpider()
        self.deep_mind = True
   
    def initOcwLinks(self, subject):
        self.ocw_links = self.ocw_spider.getOcwLinks(subject)

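    # Look up assigned textbooks on MIT's textbook service, trying recent
    # spring terms in order; returns a 'textbook:...' tag or '' if none found.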
    def getTextBook(self, course_num):
        terms = ['2015SP', '2014SP', '2013SP']
        for term in terms:
            r = requests.get('http://eduapps.mit.edu/textbook/books.html?Term=' + term + '&Subject=' + course_num)
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                table = soup.find('table', class_='displayTable')
                if table == None:
                    continue
                splits = table.text.strip()[table.text.strip().find('Price') + 6 :].strip().split('\n')
                if splits[0] == 'Course Has No Materials':
                    continue
                return 'textbook:' + splits[1] + ' (' + splits[0] + ')' + ' '
        return ''

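    # Resolve a course number to a URL: prefer the OCW link gathered by
    # MitOcwSpider, otherwise scan the page's anchors for a matching href.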
    def getMitCourseLink(self, links, course_num):
        if course_num == "":
            return course_num
        if self.ocw_links.get(course_num, '') != '':
            return self.ocw_links[course_num]

        for link in links:
            if link.attrs.has_key("href") and link["href"].find(course_num) != -1 and link["href"].find("editcookie.cgi") == -1:
                return link["href"]
        return ""
    
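    # Line-oriented parse of a student-catalog page: collect course links,
    # then walk the raw HTML line by line, accumulating number, title,
    # prereqs, instructors, and description into one remark per course.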
    def processStudentCatalog(self, html, f, course_code):
        #print html
        soup = BeautifulSoup(html)
        links_all = soup.find_all("a")
        course_links = []
        utils = Utils()
        for link in links_all:
            if link.attrs.has_key("href") and False == link["href"].startswith("editcookie.cgi") \
               and False == link["href"].startswith("/ent/cgi-bin") and False == link["href"].startswith("javascript:") \
               and False == link["href"].startswith("m"):
                course_links.append(link)
        course_num = ""
        title = ""
        link = ""
        textbook = ''
        prereq = ''
        instructors = ''
        remark = ''  # avoid NameError when remark is read before any description line
        for line in html.split("\n"):

            if (line.strip().startswith('<br>') and utils.clearHtmlTag(line.strip())[1 : 2] == '.') or \
                line.find('Prereq:') != -1:
                if line.find('Prereq:') != -1:
                    all_prereq = self.course_num_regex.findall(line.lower())
                    all_prereq = list(set(all_prereq))
                    for p in all_prereq:
                        prereq += p + ' '
                    if len(all_prereq) > 0:
                        prereq = 'prereq:' + prereq
                    #print course_num + '---->' + prereq
                        
                if line.strip().startswith('<') and utils.clearHtmlTag(line.strip())[1 : 2] == '.':
                    instructors = 'instructors:' + utils.clearHtmlTag(line.strip()[0 : line.strip().find('</')]) + ' '

            if line.strip().find('<h3>') != -1 or \
                (line.strip().startswith('<br>') and (line.strip()[len(line.strip()) - 1 : ] == '.' or line.strip()[len(line.strip()) - 7 : ] == 'limited')):
                line = line[line.find('>', 3) + 1 : ]
                if line.find('</h3>') == -1:
                    #print line
                    if line[0 : line.find('.')] == course_code:
                        if course_num != '':
                            print course_num + " " + title + " " + link                     

                            if instructors != '' and remark.find('instructors:') == -1:
                                remark = instructors + ' ' + remark

                            self.count += 1
                            self.write_db(f, course_num, title, link, remark)
                            remark = ''
                            course_num = ""
                            title = ""
                            link = ""
                            textbook = ''
                            prereq = ''
                            instructors = ''

                        course_num = line.strip()[0 : line.strip().find(" ")]
                        textbook = ''
                        if self.deep_mind:
                            textbook = self.getTextBook(course_num)

                        if textbook == '' and self.deep_mind and self.ocw_links.get(course_num, '') != '':
                            textbook = self.ocw_spider.getTextBook(self.ocw_links[course_num], course_num)
 
                        title = line.strip()[line.strip().find(" ") + 1 : ]
                        if course_num.find(',') != -1:
                            course_num = line.strip()[0 : line.strip().find(" ", line.strip().find(" ") + 1)]
                            title = line.strip()[line.strip().find(" ", line.strip().find(" ") + 1) + 1 : ]
                        link = self.getMitCourseLink(course_links, course_num.strip())
                    else:
                        remark = ''
                        if self.deep_mind and self.ocw_links.get(course_num, '') != '':
                            remark = self.ocw_spider.getDescription(self.ocw_spider.getDescriptionApiUrl(self.ocw_links[course_num]))
                            if remark.find('description:') != -1:
                                remark = remark[0 : remark.find('description:')]

                        if textbook != '':
                            remark += textbook
                        if prereq != '':
                            remark += prereq

                        remark += 'description:' + line.strip() + ' ' 
        if course_num != '':
            self.count = self.count + 1
            self.write_db(f, course_num, title, link, remark)

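    # Crawl student.mit.edu: walk the subject index, then each subject page
    # plus its [b-z].html continuation pages, writing parsed courses to a
    # temp db file and upgrading it only when the line count changed.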
    def studentCatalog(self):
        #mit
        #"""
        r = requests.get('http://student.mit.edu/catalog/index.cgi')
        soup = BeautifulSoup(r.text)
        for a in soup.find_all('a'):
            if a['href'].find('http') == -1 and a['href'][0 : 1] == 'm' and a['href'][0 : 4] != 'mail':
                subject = a.text
                if subject.find('-') != -1:
                    subject = subject[subject.find('-') + 1 : ]
                if subject.find('(') != -1:
                    subject = subject[0 : subject.find('(')]
                subject = subject.strip()
                if self.need_update_subject(subject) == False:
                    continue 
        
                print 'init ocw course links'
                self.initOcwLinks(subject)


                print "downloading mit course info"
                file_name = self.get_file_name(subject, self.school)
                file_lines = self.countFileLineNum(file_name)
                f = self.open_db(file_name + ".tmp")
                self.count = 0
                print 'processing ' + a['href'] 
                r = requests.get("http://student.mit.edu/catalog/" + a['href'])
                print "processing html and write data to file..."
                course_code = a['href'][1 : a['href'].find('.html') - 1]
                #print 'course_code: ' + course_code
                self.processStudentCatalog(r.text, f, course_code)

                soup = BeautifulSoup(r.text)
                regex = re.compile(a['href'][0 : a['href'].find('.html') - 1] + '[b-z].html')
                for link in sorted(list(set(regex.findall(r.text)))):
                    print 'processing ' + link
                    r = requests.get("http://student.mit.edu/catalog/" + link)
                    self.processStudentCatalog(r.text, f, course_code) 
     
    
                self.close_db(f)
                if file_lines != self.count and self.count > 0:
                    self.do_upgrade_db(file_name)
                    print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
                else:
                    self.cancel_upgrade(file_name)
                    print "no need upgrade\n"
            #"""

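    # Parse one subject page of catalog.mit.edu: each 'courseblock' div
    # yields number, title, prereqs, term, units, instructors, and
    # description, merged into a single remark string.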
    def processBulletinCatalog(self, url, subject):
        print 'process ' + subject
        self.initOcwLinks(subject)
        file_name = self.get_file_name(subject, self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        r = requests.get(url)
        utils = Utils()
        soup = BeautifulSoup(r.text)
        for div in soup.find_all('div', class_="courseblock"):
            sp = BeautifulSoup(div.prettify())
            h4 = sp.find('h4', class_='courseblocktitle')
            p1 = sp.find('p', class_='courseblockextra')
            p2 = sp.find('p', class_='courseblockdesc')
            p3 = sp.find('p', class_='courseblockinstructors seemore')
            text = h4.text.replace('\n', '').replace('[J]', '').strip()
            course_num = text[0 : text.find(' ')]
            title = text[text.find(' ') + 1 : ].strip()
            textbook = ''
            if self.deep_mind:
                textbook = self.getTextBook(course_num)

            if textbook == '' and self.deep_mind and self.ocw_links.get(course_num, '') != '':
                textbook = self.ocw_spider.getTextBook(self.ocw_links[course_num], course_num)

            link = self.getMitCourseLink([], course_num)
            preq = ''
            same_subject = ''
            desc = ''
            instructors = ''
            remark = ''
            units = ''
            term = ''  # avoid NameError when no courseblockterms span is present
            if p1 != None:
                sp_1 = BeautifulSoup(p1.prettify())
                span = sp_1.find('span', class_='courseblockprereq')
                if span != None:
                    preq = utils.removeDoubleSpace(span.text.replace('\n', '').replace(',','').replace(';','').strip().lower())
                span = sp_1.find('span', class_='courseblockcluster')
                if span != None:
                    same_subject = utils.removeDoubleSpace(span.text.replace('\n', '').strip())
                span = sp_1.find('span', class_='courseblockterms')
                if span != None:
                    term = utils.removeDoubleSpace(span.text.replace('\n', '').strip())
                span = sp_1.find('span', class_='courseblockhours')
                if span != None:
                    units = utils.removeDoubleSpace(span.text.replace('\n', '').strip())

            if p2 != None:
                desc = utils.removeDoubleSpace(p2.text.replace('\n', '').strip())
            if p3 != None:
                instructors = utils.removeDoubleSpace(p3.text.replace('\n', '').strip())

            if self.deep_mind and self.ocw_links.get(course_num, '') != '':
                remark = self.ocw_spider.getDescription2(self.ocw_spider.getDescriptionApiUrl(self.ocw_links[course_num])) + ' '

            if preq != '':
                remark += preq.replace(': ', ':') + ' '
            if instructors != '':
                remark += 'instructors:' + instructors + ' '
            if term != '':
                remark += 'term:' + term + ' '
            if textbook != '':
                remark += '' + textbook + ' '
            if desc != '':
                remark += 'description:' + desc + ' ' + same_subject + ' ' + units
            print text 
            self.count += 1
            self.write_db(f, course_num, title, link, remark)
            #print remark
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

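    # Enumerate subjects from the bulletin's sitemap links and process every
    # subject page that need_update_subject() approves.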
    def bulletinCatalog(self):
        r = requests.get('http://catalog.mit.edu/subjects/#bycoursenumbertext')
        soup = BeautifulSoup(r.text)
        div = soup.find('div', class_='notinpdf')
        soup = BeautifulSoup(div.prettify())
        for a in soup.find_all('a', class_='sitemaplink'):
            subject = a.text.replace('Course', '').strip()
            subject = subject[3 :].strip()
            if subject[0 : 1] == "/":
                subject = subject[2 :].strip()
            if self.need_update_subject(subject) == False:
                continue
            self.processBulletinCatalog("http://catalog.mit.edu" + a['href'], subject)

    def doWork(self):
        #self.studentCatalog()
        self.bulletinCatalog()
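A minimal driver sketch for this class, assuming Spider, MitOcwSpider, and Utils are importable from the project's own modules (import path hypothetical):

# Hypothetical usage (Python 2, matching the snippets above).
spider = MitSpider()
spider.doWork()  # runs bulletinCatalog(); studentCatalog() is the commented-out alternative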
Example #3
class MitSpider(Spider):
    ocw_links = {}
    course_num_regex = re.compile(r'[0-9]+\.[0-9]+[a-z]*')

    def __init__(self):
        Spider.__init__(self)
        self.school = "mit"
        self.ocw_spider = MitOcwSpider()
        self.deep_mind = True

    def initOcwLinks(self, subject):
        self.ocw_links = self.ocw_spider.getOcwLinks(subject)

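    # In this variant the textbook lookup is disabled: the whole body is
    # wrapped in a string literal, so the method always returns ''.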
    def getTextBook(self, course_num):
        '''
        terms = ['2015SP', '2014SP', '2013SP']
        for term in terms:
            r = requests.get('http://eduapps.mit.edu/textbook/books.html?Term=' + term + '&Subject=' + course_num)
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                table = soup.find('table', class_='displayTable')
                if table == None:
                    continue
                splits = table.text.strip()[table.text.strip().find('Price') + 6 :].strip().split('\n')
                if splits[0] == 'Course Has No Materials':
                    continue
                return 'textbook:' + splits[1] + ' (' + splits[0] + ')' + ' '
        '''
        return ''

    def getMitCourseLink(self, links, course_num):
        if course_num == "":
            return course_num
        if self.ocw_links.get(course_num, '') != '':
            return self.ocw_links[course_num]

        for link in links:
            if link.attrs.has_key("href") and link["href"].find(
                    course_num) != -1 and link["href"].find(
                        "editcookie.cgi") == -1:
                return link["href"]
        return ""

    def processStudentCatalog(self, html, f, course_code):
        #print html
        soup = BeautifulSoup(html)
        links_all = soup.find_all("a")
        course_links = []
        utils = Utils()
        for link in links_all:
            if link.attrs.has_key("href") and False == link["href"].startswith("editcookie.cgi") \
               and False == link["href"].startswith("/ent/cgi-bin") and False == link["href"].startswith("javascript:") \
               and False == link["href"].startswith("m"):
                course_links.append(link)
        course_num = ""
        title = ""
        link = ""
        textbook = ''
        prereq = ''
        instructors = ''
        remark = ''  # avoid NameError when remark is read before any description line
        for line in html.split("\n"):

            if (line.strip().startswith('<br>') and utils.clearHtmlTag(line.strip())[1 : 2] == '.') or \
                line.find('Prereq:') != -1:
                if line.find('Prereq:') != -1:
                    all_prereq = self.course_num_regex.findall(line.lower())
                    all_prereq = list(set(all_prereq))
                    for p in all_prereq:
                        prereq += p + ' '
                    if len(all_prereq) > 0:
                        prereq = 'prereq:' + prereq
                    #print course_num + '---->' + prereq

                if line.strip().startswith('<') and utils.clearHtmlTag(
                        line.strip())[1:2] == '.':
                    instructors = 'instructors:' + utils.clearHtmlTag(
                        line.strip()[0:line.strip().find('</')]) + ' '

            if line.strip().find('<h3>') != -1 or \
                (line.strip().startswith('<br>') and (line.strip()[len(line.strip()) - 1 : ] == '.' or line.strip()[len(line.strip()) - 7 : ] == 'limited')):
                line = line[line.find('>', 3) + 1:]
                if line.find('</h3>') == -1:
                    #print line
                    if line[0:line.find('.')] == course_code:
                        if course_num != '':
                            print course_num + " " + title + " " + link

                            if instructors != '' and remark.find(
                                    'instructors:') == -1:
                                remark = instructors + ' ' + remark

                            self.count += 1
                            self.write_db(f, course_num, title, link, remark)
                            remark = ''
                            course_num = ""
                            title = ""
                            link = ""
                            textbook = ''
                            prereq = ''
                            instructors = ''

                        course_num = line.strip()[0:line.strip().find(" ")]
                        textbook = ''
                        if self.deep_mind:
                            textbook = self.getTextBook(course_num)

                        if textbook == '' and self.deep_mind and self.ocw_links.get(
                                course_num, '') != '':
                            textbook = self.ocw_spider.getTextBook(
                                self.ocw_links[course_num], course_num)

                        title = line.strip()[line.strip().find(" ") + 1:]
                        if course_num.find(',') != -1:
                            # two course numbers listed: split at the second space
                            second_space = line.strip().find(
                                " ", line.strip().find(" ") + 1)
                            course_num = line.strip()[0:second_space]
                            title = line.strip()[second_space + 1:]
                        link = self.getMitCourseLink(course_links,
                                                     course_num.strip())
                    else:
                        remark = ''
                        if self.deep_mind and self.ocw_links.get(
                                course_num, '') != '':
                            remark = self.ocw_spider.getDescription(
                                self.ocw_spider.getDescriptionApiUrl(
                                    self.ocw_links[course_num]))
                            if remark.find('description:') != -1:
                                remark = remark[0:remark.find('description:')]

                        if textbook != '':
                            remark += textbook
                        if prereq != '':
                            remark += prereq

                        remark += 'description:' + line.strip() + ' '
        if course_num != '':
            self.count = self.count + 1
            self.write_db(f, course_num, title, link, remark)

    def studentCatalog(self):
        #mit
        #"""
        r = requests.get('http://student.mit.edu/catalog/index.cgi')
        soup = BeautifulSoup(r.text)
        for a in soup.find_all('a'):
            if a['href'].find('http') == -1 and a['href'][
                    0:1] == 'm' and a['href'][0:4] != 'mail':
                subject = a.text
                if subject.find('-') != -1:
                    subject = subject[subject.find('-') + 1:]
                if subject.find('(') != -1:
                    subject = subject[0:subject.find('(')]
                subject = subject.strip()
                if self.need_update_subject(subject) == False:
                    continue

                print 'init ocw course links'
                self.initOcwLinks(subject)

                print "downloading mit course info"
                file_name = self.get_file_name(subject, self.school)
                file_lines = self.countFileLineNum(file_name)
                f = self.open_db(file_name + ".tmp")
                self.count = 0
                print 'processing ' + a['href']
                r = requests.get("http://student.mit.edu/catalog/" + a['href'])
                print "processing html and write data to file..."
                course_code = a['href'][1:a['href'].find('.html') - 1]
                #print 'course_code: ' + course_code
                self.processStudentCatalog(r.text, f, course_code)

                soup = BeautifulSoup(r.text)
                regex = re.compile(a['href'][0:a['href'].find('.html') - 1] +
                                   '[b-z].html')
                for link in sorted(list(set(regex.findall(r.text)))):
                    print 'processing ' + link
                    r = requests.get("http://student.mit.edu/catalog/" + link)
                    self.processStudentCatalog(r.text, f, course_code)

                self.close_db(f)
                if file_lines != self.count and self.count > 0:
                    self.do_upgrade_db(file_name)
                    print "before lines: " + str(
                        file_lines) + " after update: " + str(
                            self.count) + " \n\n"
                else:
                    self.cancel_upgrade(file_name)
                    print "no need upgrade\n"
            #"""

    def processBulletinCatalog(self, url, subject):
        print 'process ' + subject
        self.initOcwLinks(subject)
        file_name = self.get_file_name(subject, self.school)
        file_lines = self.countFileLineNum(file_name)
        f = self.open_db(file_name + ".tmp")
        self.count = 0
        r = requests.get(url)
        utils = Utils()
        soup = BeautifulSoup(r.text)
        for div in soup.find_all('div', class_="courseblock"):
            sp = BeautifulSoup(div.prettify())
            h4 = sp.find('h4', class_='courseblocktitle')
            p1 = sp.find('p', class_='courseblockextra')
            p2 = sp.find('p', class_='courseblockdesc')
            p3 = sp.find('p', class_='courseblockinstructors seemore')
            text = h4.text.replace('\n', '').replace('[J]', '').strip()
            course_num = text[0:text.find(' ')]
            title = text[text.find(' ') + 1:].strip()
            textbook = ''
            if self.deep_mind:
                textbook = self.getTextBook(course_num)

            if textbook == '' and self.deep_mind and self.ocw_links.get(
                    course_num, '') != '':
                textbook = self.ocw_spider.getTextBook(
                    self.ocw_links[course_num], course_num)

            link = self.getMitCourseLink([], course_num)
            preq = ''
            same_subject = ''
            desc = ''
            instructors = ''
            remark = ''
            units = ''
            term = ''  # avoid NameError when no courseblockterms span is present
            if p1 != None:
                sp_1 = BeautifulSoup(p1.prettify())
                span = sp_1.find('span', class_='courseblockprereq')
                if span != None:
                    preq = utils.removeDoubleSpace(
                        span.text.replace('\n', '').replace(',', '').replace(
                            ';', '').strip().lower())
                span = sp_1.find('span', class_='courseblockcluster')
                if span != None:
                    same_subject = utils.removeDoubleSpace(
                        span.text.replace('\n', '').strip())
                span = sp_1.find('span', class_='courseblockterms')
                if span != None:
                    term = utils.removeDoubleSpace(
                        span.text.replace('\n', '').strip())
                span = sp_1.find('span', class_='courseblockhours')
                if span != None:
                    units = utils.removeDoubleSpace(
                        span.text.replace('\n', '').strip())

            if p2 != None:
                desc = utils.removeDoubleSpace(
                    p2.text.replace('\n', '').strip())
            if p3 != None:
                instructors = utils.removeDoubleSpace(
                    p3.text.replace('\n', '').strip())

            if self.deep_mind and self.ocw_links.get(course_num, '') != '':
                remark = self.ocw_spider.getDescription2(
                    self.ocw_spider.getDescriptionApiUrl(
                        self.ocw_links[course_num])) + ' '

            if preq != '':
                remark += preq.replace(': ', ':') + ' '
            if instructors != '':
                remark += 'instructors:' + instructors + ' '
            if term != '':
                remark += 'term:' + term + ' '
            if textbook != '':
                remark += '' + textbook + ' '
            if desc != '':
                remark += 'description:' + desc + ' ' + same_subject + ' ' + units
            print text
            self.count += 1
            self.write_db(f, course_num, title, link, remark)
            #print remark
        self.close_db(f)
        if file_lines != self.count and self.count > 0:
            self.do_upgrade_db(file_name)
            print "before lines: " + str(file_lines) + " after update: " + str(
                self.count) + " \n\n"
        else:
            self.cancel_upgrade(file_name)
            print "no need upgrade\n"

    def bulletinCatalog(self):
        r = requests.get('http://catalog.mit.edu/subjects/#bycoursenumbertext')
        soup = BeautifulSoup(r.text)
        div = soup.find('div', class_='notinpdf')
        soup = BeautifulSoup(div.prettify())
        for a in soup.find_all('a', class_='sitemaplink'):
            subject = a.text.replace('Course', '').strip()
            subject = subject[3:].strip()
            if subject[0:1] == "/":
                subject = subject[2:].strip()
            if self.need_update_subject(subject) == False:
                continue
            self.processBulletinCatalog("http://catalog.mit.edu" + a['href'],
                                        subject)

    def doWork(self):
        #self.studentCatalog()
        self.bulletinCatalog()
Example #4
class MitSpider(Spider):
    ocw_links = {}
    course_num_regex = re.compile(r'[0-9]+\.[0-9]+[a-z]*')

    def __init__(self):
        Spider.__init__(self)
        self.school = "mit"
        self.ocw_spider = MitOcwSpider()
        self.deep_mind = True

    def initOcwLinks(self, subject):
        self.ocw_links = self.ocw_spider.getOcwLinks(subject)

    def getTextBook(self, course_num):
        terms = ['2014SP', '2013SP', '2012SP']
        for term in terms:
            r = requests.get(
                'http://eduapps.mit.edu/textbook/books.html?Term=' + term +
                '&Subject=' + course_num)
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                table = soup.find('table', class_='displayTable')
                if table == None:
                    continue
                splits = table.text.strip()[table.text.strip().find('Price') +
                                            6:].strip().split('\n')
                if splits[0] == 'Course Has No Materials':
                    continue
                return 'textbook:' + splits[1] + ' (' + splits[0] + ')' + ' '
        return ''

    def getMitCourseLink(self, links, course_num):
        if course_num == "":
            return course_num
        if self.ocw_links.get(course_num, '') != '':
            return self.ocw_links[course_num]

        for link in links:
            if link.attrs.has_key("href") and link["href"].find(
                    course_num) != -1 and link["href"].find(
                        "editcookie.cgi") == -1:
                return link["href"]
        return ""

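    # processMitData is the same line-oriented catalog parser that appears as
    # processStudentCatalog in the earlier examples.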
    def processMitData(self, html, f, course_code):
        #print html
        soup = BeautifulSoup(html)
        links_all = soup.find_all("a")
        course_links = []
        utils = Utils()
        for link in links_all:
            if link.attrs.has_key("href") and False == link["href"].startswith("editcookie.cgi") \
               and False == link["href"].startswith("/ent/cgi-bin") and False == link["href"].startswith("javascript:") \
               and False == link["href"].startswith("m"):
                course_links.append(link)
        course_num = ""
        title = ""
        link = ""
        textbook = ''
        prereq = ''
        instructors = ''
        remark = ''  # avoid NameError when remark is read before any description line
        for line in html.split("\n"):

            if (line.strip().startswith('<br>') and utils.clearHtmlTag(line.strip())[1 : 2] == '.') or \
                line.find('Prereq:') != -1:
                if line.find('Prereq:') != -1:
                    all_prereq = self.course_num_regex.findall(line.lower())
                    all_prereq = list(set(all_prereq))
                    for p in all_prereq:
                        prereq += p + ' '
                    if len(all_prereq) > 0:
                        prereq = 'prereq:' + prereq
                    #print course_num + '---->' + prereq

                if line.strip().startswith('<') and utils.clearHtmlTag(
                        line.strip())[1:2] == '.':
                    instructors = 'instructors:' + utils.clearHtmlTag(
                        line.strip()[0:line.strip().find('</')]) + ' '

            if line.strip().find('<h3>') != -1 or \
                (line.strip().startswith('<br>') and (line.strip()[len(line.strip()) - 1 : ] == '.' or line.strip()[len(line.strip()) - 7 : ] == 'limited')):
                line = line[line.find('>', 3) + 1:]
                if line.find('</h3>') == -1:
                    #print line
                    if line[0:line.find('.')] == course_code:
                        if course_num != '':
                            print course_num + " " + title + " " + link

                            if instructors != '' and remark.find(
                                    'instructors:') == -1:
                                remark = instructors + ' ' + remark

                            self.count += 1
                            self.write_db(f, course_num, title, link, remark)
                            remark = ''
                            course_num = ""
                            title = ""
                            link = ""
                            textbook = ''
                            prereq = ''
                            instructors = ''

                        course_num = line.strip()[0:line.strip().find(" ")]
                        textbook = ''
                        if self.deep_mind:
                            textbook = self.getTextBook(course_num)

                        if textbook == '' and self.deep_mind and self.ocw_links.get(
                                course_num, '') != '':
                            textbook = self.ocw_spider.getTextBook(
                                self.ocw_links[course_num], course_num)

                        title = line.strip()[line.strip().find(" ") + 1:]
                        if course_num.find(',') != -1:
                            # two course numbers listed: split at the second space
                            second_space = line.strip().find(
                                " ", line.strip().find(" ") + 1)
                            course_num = line.strip()[0:second_space]
                            title = line.strip()[second_space + 1:]
                        link = self.getMitCourseLink(course_links,
                                                     course_num.strip())
                    else:
                        remark = ''
                        if self.deep_mind and self.ocw_links.get(
                                course_num, '') != '':
                            remark = self.ocw_spider.getDescription(
                                self.ocw_spider.getDescriptionApiUrl(
                                    self.ocw_links[course_num]))
                            if remark.find('description:') != -1:
                                remark = remark[0:remark.find('description:')]

                        if textbook != '':
                            remark += textbook
                        if prereq != '':
                            remark += prereq

                        remark += 'description:' + line.strip() + ' '
        if course_num != '':
            self.count = self.count + 1
            self.write_db(f, course_num, title, link, remark)

    def doWork(self):
        #mit
        #"""
        r = requests.get('http://student.mit.edu/catalog/index.cgi')
        soup = BeautifulSoup(r.text)
        for a in soup.find_all('a'):
            if a['href'].find('http') == -1 and a['href'][
                    0:1] == 'm' and a['href'][0:4] != 'mail':
                subject = a.text
                if subject.find('-') != -1:
                    subject = subject[subject.find('-') + 1:]
                if subject.find('(') != -1:
                    subject = subject[0:subject.find('(')]
                subject = subject.strip()
                if self.need_update_subject(subject) == False:
                    continue

                print 'init ocw course links'
                self.initOcwLinks(subject)

                print "downloading mit course info"
                file_name = self.get_file_name(subject, self.school)
                file_lines = self.countFileLineNum(file_name)
                f = self.open_db(file_name + ".tmp")
                self.count = 0
                print 'processing ' + a['href']
                r = requests.get("http://student.mit.edu/catalog/" + a['href'])
                print "processing html and write data to file..."
                course_code = a['href'][1:a['href'].find('.html') - 1]
                #print 'course_code: ' + course_code
                self.processMitData(r.text, f, course_code)

                soup = BeautifulSoup(r.text)
                regex = re.compile(a['href'][0:a['href'].find('.html') - 1] +
                                   '[b-z].html')
                for link in sorted(list(set(regex.findall(r.text)))):
                    print 'processing ' + link
                    r = requests.get("http://student.mit.edu/catalog/" + link)
                    self.processMitData(r.text, f, course_code)

                self.close_db(f)
                if file_lines != self.count and self.count > 0:
                    self.do_upgrade_db(file_name)
                    print "before lines: " + str(
                        file_lines) + " after update: " + str(
                            self.count) + " \n\n"
                else:
                    self.cancel_upgrade(file_name)
                    print "no need upgrade\n"