def read_and_stock_dict_course_items_from_webroot_source(self, course_items_source_htmlwebroot_filename=None):
   '''
   1st read option: course items are extracted from the HTML webroot source file.

   The file name is resolved through
   self.return_course_items_source_htmlwebroot_filename_or_default_or_raise(),
   the file text is scanned with self.re_compiled_text_to_find and, for every
   match, group(1) is taken as the course id and group(2) as its sequence
   number. A match is skipped when the sequence number does not parse as an
   integer or when the course id is already a key of
   self.unique_course_id_dict. Otherwise the CourseraCourse with that cid is
   fetched, or created and saved when it does not exist, and stored in
   self.unique_course_id_dict under the course id.

   course_items_source_htmlwebroot_filename -- path of the HTML file; it is
     handed unchanged (None included) to the resolver named above.

   Returns None; the result is the updated self.unique_course_id_dict.
   '''
   course_items_source_htmlwebroot_filename = self.return_course_items_source_htmlwebroot_filename_or_default_or_raise(course_items_source_htmlwebroot_filename)
   # Context manager: the handle is closed even when read() raises.
   with open(course_items_source_htmlwebroot_filename) as html_file:
     text = html_file.read()
   for each_re_found in self.re_compiled_text_to_find.finditer(text):
     course_id    = each_re_found.group(1)
     course_n_seq = each_re_found.group(2)
     try:
       # Validation only: the converted value is discarded. TypeError covers
       # a group that did not take part in the match (group() gives None).
       int(course_n_seq)
     except (ValueError, TypeError):
       continue
     if course_id in self.unique_course_id_dict:
       continue
     try:
       coursera_item_obj = CourseraCourse.objects.get(cid=course_id)
     except CourseraCourse.DoesNotExist:
       coursera_item_obj = CourseraCourse()
       coursera_item_obj.cid = course_id
       # NOTE(review): the matched text is stored as is, not the int parsed
       # above -- confirm this is what the n_seq field expects.
       coursera_item_obj.n_seq = course_n_seq
       coursera_item_obj.save()
     self.unique_course_id_dict[course_id] = coursera_item_obj
def _div_has_class(div, class_name):
  '''
  Tell whether class_name is one of the whitespace-separated tokens of the
  element's "class" attribute (False when the attribute is absent).
  '''
  return class_name in (div.get('class') or '').split()


def scrape(html_text):
  '''
  Parse a course-listing page and build a CourseraCourse per listing found.

  html_text -- markup handed to ET.fromstring(), so it must be well-formed
    enough for that parser.

  The walk goes: <div> with a data-course-id attribute -> nested <div> of
  class "coursera-course-listing-text" -> nested <div> of class
  "coursera-course-listing-main". The first <h3>/<a> under the latter gives
  the title (link text) and the href, whose 4th '/'-separated piece is read
  as "<cid>-<n_seq>". Listings without that link, with a shorter href, or
  with a non-numeric n_seq are skipped.

  NOTE(review): the "coursera-course-listing-*" names are matched against
  the class attribute here; the previous code passed them to Element.get()
  as attribute names and then called .iter()/.find() on the returned
  string, which raises AttributeError -- confirm against the real markup.
  NOTE(review): the course built here is neither saved nor returned, and
  the parsed start month/day are not stored anywhere -- confirm the
  intended output of this function.

  Returns None.
  '''
  root = ET.fromstring(html_text)
  for divL1 in root.iter('div'):
    # Keep descending from the element itself; get() only yields the
    # attribute value.
    if divL1.get('data-course-id') is None:
      continue
    for divL2 in divL1.iter('div'):
      if not _div_has_class(divL2, 'coursera-course-listing-text'):
        continue
      for divL3 in divL2.iter('div'):
        if not _div_has_class(divL3, 'coursera-course-listing-main'):
          continue
        h3_tag = divL3.find('h3')
        a_tag = h3_tag.find('a') if h3_tag is not None else None
        href = a_tag.get('href') if a_tag is not None else None
        if href is None:
          continue
        pp = href.split('/')
        if len(pp) < 4:
          continue
        cid_n_seq = pp[3]
        pp = cid_n_seq.split('-')
        try:
          n_seq = int(pp[-1])
        except ValueError:
          # Not a "<cid>-<number>" link, so not a course item.
          continue
        course = CourseraCourse()
        course.title = a_tag.text
        course.cid = '-'.join(pp[:-1])
        course.n_seq = n_seq
        for divL4 in divL3.iter('div'):
          if _div_has_class(divL4, 'coursera-course-listing-progress'):
            start_date_span = divL4.find('span')
            if start_date_span is not None and start_date_span.text:
              pp = start_date_span.text.split(' ')
              month_str = pp[0]
              # Last word minus its two trailing characters
              # (presumably an ordinal suffix such as "th" -- confirm).
              day_str = pp[-1][:-2]
              # index() is 0-based, months are 1-based.
              month = timeutils.array_3letter_months_english.index(month_str) + 1
          if _div_has_class(divL4, 'coursera-course-listing-statement'):
            outter_div_for_university = divL4.find('div')
            university_a_tag = None
            if outter_div_for_university is not None:
              university_a_tag = outter_div_for_university.find('a')
            if university_a_tag is not None and university_a_tag.get('class') is not None:
              # A class attribute on the link confirms we're in the right <div />.
              university_name = university_a_tag.text
              institution = Institution.objects.get(name=university_name)
              course.institutions.add(institution)
# Example no. 3
# 0
 def get_course_or_create_it_or_None(self, cid, n_seq):
     '''
     Fetch the CourseraCourse whose cid matches, creating it when missing.

     Returns None when n_seq equals 0, or when the stored course carries an
     n_seq different from the one given. A stored n_seq of -1 is overwritten
     with the given n_seq on the returned object.
     NOTE(review): that overwrite is not saved in this method -- confirm the
     caller persists it.
     '''
     if n_seq == 0:
         return None
     try:
         stored_course = CourseraCourse.objects.get(cid=cid)
     except CourseraCourse.DoesNotExist:
         # Unknown cid: register it right away with the given sequence number.
         new_course = CourseraCourse()
         new_course.cid = cid
         new_course.n_seq = n_seq
         new_course.save()
         return new_course
     else:
         if stored_course.n_seq == -1:
             stored_course.n_seq = n_seq
         return None if stored_course.n_seq != n_seq else stored_course
 def save_courses_subset_to_db(self):
   '''
   Save one CourseraCourse per item of self.courses_subset.

   For each item a new CourseraCourse receives the item's cid, get_n_seq()
   and title, plus start_date and duration_in_weeks when they are not None.
   When the item has a university, an Institution with that name is looked
   up and, if missing, created and saved. The course is then saved.
   A progress line is printed per item.

   NOTE(review): the institution is not attached to the course here (the
   linking call was commented out in the previous version) -- confirm.

   Returns None.
   '''
   for i, course_subset in enumerate(self.courses_subset):
     counter = str(i + 1).zfill(3)
     try:
       # One pre-built string so the call prints the same text whether
       # print is a statement (Python 2) or a function (Python 3).
       print(' '.join([counter, 'Saving to db', str(course_subset)]))
     except UnicodeEncodeError:
       # The item's text cannot be encoded: fall back to the bare message.
       print(' '.join([counter, 'Saving to db']))
     course = CourseraCourse()
     course.cid = course_subset.cid
     course.n_seq = course_subset.get_n_seq()
     course.title = course_subset.title
     if course_subset.start_date is not None:
       course.start_date = course_subset.start_date
     if course_subset.duration_in_weeks is not None:
       course.duration_in_weeks = course_subset.duration_in_weeks
     if course_subset.university is not None:
       university_name = course_subset.university
       try:
         institution = Institution.objects.get(name=university_name)
       except Institution.DoesNotExist:
         institution = Institution()
         institution.name = university_name
         institution.save()
         print('institution id %s' % (institution.id,))
       except AttributeError as e:
         # Best effort: report the failing name and go on saving the course.
         print('university_name %s' % (university_name,))
         print(e)
     course.save()
def make_test_course():
  '''
  Build and return a sample CourseraCourse ("Introduction to Statistics",
  cid 'introstats2') with one Institution, one Instructor and one Category
  assigned to it.

  No save() is called in this function. The course and the result of
  course.instructors.values() are printed before the course is returned.
  '''
  course = CourseraCourse()
  course.cid = 'introstats2'
  course.n_seq = 1 # '001'

  course.title = 'Introduction to Statistics'
  course.description = 'Introduction to Statistics is nice course!'

  course.start_date = datetime.date(2013, 4, 5)
  course.duration_in_weeks = 8
  course.workload_in_hours_per_week = 3

  institution = Institution()
  institution.id = 10
  institution.name = 'Harvard Univ.'
  course.institutions = [institution]

  professor = Instructor()
  professor.id = 10
  professor.name = 'John Joey'
  professor.institution = institution
  course.instructors = [professor]

  category = Category()
  category.id = 10
  category.name = 'Mathematics & Statistics'
  course.categories = [category]

  # Single pre-formatted argument: same output from the Python 2 print
  # statement and the Python 3 print function.
  print('course %s' % (course,))
  print('Instructors %s' % (course.instructors.values(),))

  return course