def parse_syllabus(session, page, reverse=False):
    """
    Parse a Coursera course listing/syllabus page.

    Each section corresponds to one week of classes; each lecture maps a
    format (e.g. 'mp4', 'pdf') to a list of (url, title) resources.

    :param session: requests session used to fetch preview/hidden videos.
    :param page: HTML of the syllabus page.
    :param reverse: if True, return the sections in reverse order.
    :return: list of (section_name, [(lecture_name, {fmt: [(url, title)]})]).
    """
    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    for stag in soup.findAll(attrs={'class':
                                    re.compile('^course-item-list-header')}):
        assert stag.contents[0] is not None, "couldn't find section"
        section_name = clean_filename(stag.contents[0].contents[1])
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            vname = clean_filename(vtag.a.contents[0])
            logging.info(' %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                title = clean_filename(a.get('title', ''))
                fmt = get_anchor_format(href)
                logging.debug(' %s %s', fmt, href)
                if fmt:
                    lecture.setdefault(fmt, []).append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_video(session, lecture_page)
                        lecture.setdefault('mp4', []).append(
                            (fix_url(href), ''))
                    except TypeError:
                        # logging.warn is deprecated; use warning
                        logging.warning(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug(' %s %s', fmt, href)
                        if href is not None:
                            lecture.setdefault(fmt, []).append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    # empty containers are falsy; no need for len()
    if not sections:
        logging.error('Probably bad cookies file (or wrong class name)')

    return sections
def parse_old_style_syllabus(session, page, reverse=False, intact_fnames=False,
                             subtitle_language="en"):
    """
    Parse an old style Coursera course listing/syllabus page.

    Each section is a week of classes; each lecture maps a format
    (e.g. "mp4", "pdf", "srt") to a list of (url, title) resources.

    :param session: requests session used to fetch preview/hidden videos.
    :param page: HTML of the syllabus page.
    :param reverse: if True, return the sections in reverse order.
    :param intact_fnames: passed to clean_filename; keep filenames intact.
    :param subtitle_language: language code to substitute into subtitle URLs
        (only applied when it differs from "en").
    :return: list of (section_name, [(lecture_name, {fmt: [(url, title)]})]).
    """
    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={"class":
                                re.compile("^course-item-list-header")})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, intact_fnames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll("li"):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, intact_fnames)
            logging.info(" %s", vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll("a"):
                href = fix_url(a["href"])
                untouched_fname = a.get("title", "")
                title = clean_filename(untouched_fname, intact_fnames)
                fmt = get_anchor_format(href)
                # rewrite subtitle URLs/titles for the requested language
                if fmt in ("srt", "txt") and subtitle_language != "en":
                    title = title.replace(
                        "_en&format", "_" + subtitle_language + "&format")
                    href = href.replace(
                        "_en&format", "_" + subtitle_language + "&format")

                logging.debug(" %s %s", fmt, href)
                if fmt:
                    lecture.setdefault(fmt, []).append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture.setdefault("mp4", []).append(
                            (fix_url(href), ""))
                    except TypeError:
                        # logging.warn is deprecated; use warning
                        logging.warning(
                            "Could not get resource: %s", lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if "mp4" not in lecture:
                for a in vtag.findAll("a"):
                    if a.get("data-modal-iframe"):
                        href = grab_hidden_video_url(
                            session, a["data-modal-iframe"])
                        href = fix_url(href)
                        fmt = "mp4"
                        logging.debug(" %s %s", fmt, href)
                        if href is not None:
                            lecture.setdefault(fmt, []).append((href, ""))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], "")
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], "{0:d}_{1}".format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info("Found %d sections and %d lectures on this page",
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    # empty containers are falsy; no need for len()
    if not sections:
        logging.error("The cookies file may be invalid, "
                      "please re-run with the `--clear-cache` option.")

    return sections
def parse_syllabus(session, page, reverse=False, intact_fnames=False):
    """
    Parse a Coursera course listing/syllabus page.

    Each section is a week of classes; each lecture maps a format
    (e.g. 'mp4', 'pdf') to a list of (url, title) resources.

    :param session: requests session used to fetch preview/hidden videos.
    :param page: HTML of the syllabus page.
    :param reverse: if True, return the sections in reverse order.
    :param intact_fnames: passed to clean_filename; keep filenames intact.
    :return: list of (section_name, [(lecture_name, {fmt: [(url, title)]})]).
    """
    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    for stag in soup.findAll(
            attrs={'class': re.compile('^course-item-list-header')}):
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, intact_fnames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, intact_fnames)
            logging.info(' %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname, intact_fnames)
                fmt = get_anchor_format(href)
                logging.debug(' %s %s', fmt, href)
                if fmt:
                    lecture.setdefault(fmt, []).append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_video(session, lecture_page)
                        lecture.setdefault('mp4', []).append(
                            (fix_url(href), ''))
                    except TypeError:
                        # logging.warn is deprecated; use warning
                        logging.warning(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug(' %s %s', fmt, href)
                        if href is not None:
                            lecture.setdefault(fmt, []).append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    # empty containers are falsy; no need for len()
    if not sections:
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections
def parse_old_style_syllabus(session, page, reverse=False,
                             unrestricted_filenames=False,
                             subtitle_language='en'):
    """
    Parse an old style Coursera course listing/syllabus page.

    Each section is a week of classes; each lecture maps a format
    (e.g. 'mp4', 'pdf', 'srt') to a list of (url, title) resources.

    :param session: requests session used to fetch preview/hidden videos.
    :param page: HTML of the syllabus page.
    :param reverse: if True, return the sections in reverse order.
    :param unrestricted_filenames: passed to clean_filename; keep filenames
        unrestricted.
    :param subtitle_language: language code to substitute into subtitle URLs
        (only applied when it differs from 'en').
    :return: list of (section_name, [(lecture_name, {fmt: [(url, title)]})]).
    """
    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={'class':
                                re.compile('^course-item-list-header')})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, unrestricted_filenames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, unrestricted_filenames)
            logging.info(' %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname,
                                       unrestricted_filenames)
                fmt = get_anchor_format(href)
                # rewrite subtitle URLs/titles for the requested language
                if fmt in ('srt', 'txt') and subtitle_language != 'en':
                    title = title.replace(
                        '_en&format', '_' + subtitle_language + '&format')
                    href = href.replace(
                        '_en&format', '_' + subtitle_language + '&format')

                logging.debug(' %s %s', fmt, href)
                if fmt:
                    lecture.setdefault(fmt, []).append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture.setdefault('mp4', []).append(
                            (fix_url(href), ''))
                    except TypeError:
                        logging.warning(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug(' %s %s', fmt, href)
                        if href is not None:
                            lecture.setdefault(fmt, []).append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    # empty containers are falsy; no need for len()
    if not sections:
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections