Example No. 1
def extract_units(url, headers, file_formats):
	"""
	Parses a webpage and extracts its resources, e.g. video_url, sub_url, etc.
	"""
	#logging.info("Processing '%s'", url)

	page = get_page_contents(url, headers)
	page_extractor = get_page_extractor(url)
	units = page_extractor.extract_units_from_html(page, BASE_URL, file_formats)
	return units
Example No. 2
def get_available_sections(url, headers):
    """
	Extracts the sections and subsections from a given url
	"""
    logging.debug("Extracting sections for :" + url)

    page = get_page_contents(url, headers)
    page_extractor = get_page_extractor(url)
    sections = page_extractor.extract_sections_from_html(page, BASE_URL)

    logging.debug("Extracted sections: " + str(sections))
    return sections
Example No. 3
def get_courses_info(url, headers):
    """
	Extracts the courses information from the dashboard.
	"""
    logging.info('Extracting course information from dashboard.')

    page = get_page_contents(url, headers)
    page_extractor = get_page_extractor(url)
    courses = page_extractor.extract_courses_from_html(page, BASE_URL)

    logging.debug('Data extracted: %s', courses)

    return courses
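
Examples 1-3 share the same fetch-and-delegate pattern: download the page with get_page_contents, pick a site-specific extractor with get_page_extractor, and return structured objects. A minimal sketch of how the three might be chained, assuming an authenticated headers dict, a hypothetical DASHBOARD_URL constant, and url attributes on the returned objects (only name and subsections appear in the later examples; the url attributes are assumptions):

def list_course_units(headers, file_formats):
    # Hypothetical driver; DASHBOARD_URL and the .url attributes are assumed.
    courses = get_courses_info(DASHBOARD_URL, headers)
    for course in courses:
        sections = get_available_sections(course.url, headers)
        for section in sections:
            for subsection in section.subsections:
                units = extract_units(subsection.url, headers, file_formats)
                print(course.name, section.name, len(units))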
Example No. 4
def extract_video_component(args, coursename, headers, soup, section, subsection, unit):

    # Each video component is a div tagged with data-block-type="video".
    video_flag = soup.findAll("div", {"data-block-type": "video"})
    video_meta_list = []
    for video_comp in video_flag:
        video_meta = dict()
        txtjson = video_comp.find('div', {"data-metadata": True})['data-metadata']
        txt2dict = json.loads(txtjson)
        # 'streams' looks like "1.00:<youtube_id>"; strip the speed prefix.
        yt_id = re.sub(r"1\.00:", '', txt2dict['streams'])
        yt_link = 'https://youtu.be/' + yt_id
        duration = videolen(yt_link)

        video_meta.update({'section': section, 'subsection': subsection,
                           'unit_idx': unit, 'youtube_url': yt_link,
                           'video_duration': duration})
        for key, value in txt2dict['transcriptLanguages'].items():
            transcript_name = 'transcript_' + key
            transcript_url = 'https://courses.edx.org/' + re.sub(
                r"__lang__", key, txt2dict['transcriptTranslationUrl'])
            print('download ' + value + ' transcript of ' + yt_link)
            try:
                transcript_dump = get_page_contents(transcript_url, headers)
                transcript_raw = json.loads(transcript_dump)
                video_meta.update({transcript_name: transcript_raw['text']})
            except (HTTPError, URLError) as exception:
                print('     bug: cannot download from edx site')
                # Fall back to pulling the transcript from YouTube.
                transcript_dump = YT_transcript(yt_link, key)
                if len(transcript_dump) == 0:
                    print('     no transcript available on YouTube')
                    video_meta.update(
                        {transcript_name: {"start": '', "end": '', "text": ''}})
                    logging.warning('transcript (error: %s)', exception)
                    errorlog = os.path.join(args.html_dir, coursename,
                                            'transcript_error_report.txt')
                    text = '---------------------------------\n' \
                        + 'transcript error: ' + str(exception) + '\n' \
                        + 'video url: ' + yt_link + '\n' \
                        + 'language: ' + value + '\n' \
                        + 'section:  ' + section + '\n' \
                        + 'subsection: ' + subsection + '\n' \
                        + 'unit_idx: ' + unit + '\n' \
                        + '---------------------------------'
                    with open(errorlog, 'a') as f:
                        f.write(text)
                else:
                    print('     transcript was successfully downloaded from YouTube')
                    video_meta.update({transcript_name: transcript_dump['text']})

        video_meta_list.append(video_meta)
    return video_meta_list
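
For context, the data-metadata attribute parsed above holds a JSON blob whose streams field carries a playback-speed prefix ("1.00:") in front of the YouTube id. A self-contained sketch of just that parsing step on made-up markup (the HTML below is illustrative, not real edX output):

import json
import re

from bs4 import BeautifulSoup

# Illustrative markup; a real edX unit embeds a much larger metadata blob.
html = ("<div data-block-type=\"video\">"
        "<div data-metadata='{\"streams\": \"1.00:dQw4w9WgXcQ\", "
        "\"transcriptLanguages\": {\"en\": \"English\"}}'></div></div>")

soup = BeautifulSoup(html, "html.parser")
meta = json.loads(soup.find('div', {"data-metadata": True})['data-metadata'])
yt_id = re.sub(r"^1\.00:", '', meta['streams'])  # strip the speed prefix
print('https://youtu.be/' + yt_id)               # https://youtu.be/dQw4w9WgXcQ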
Example No. 5
def save_html_to_file(args, selections, all_urls, headers):

    sub_idx = 0
    prob_type_set = []
    counter_video = 1

    for selected_course, selected_sections in selections.items():
        coursename = directory_name(selected_course.name)

        for selected_section in selected_sections:
            section_dirname = "%02d-%s" % (selected_section.position,
                                           selected_section.name)
            target_dir = os.path.join(args.html_dir, coursename,
                                      clean_filename(section_dirname))
            mkdir_p(target_dir)

            for subsection in selected_section.subsections:

                if subsection.name is None:
                    subsection.name = 'Untitled'
                target_subdir = os.path.join(
                    target_dir,
                    str(sub_idx).zfill(3) + '-' +
                    clean_filename(subsection.name))
                mkdir_p(target_subdir)
                logging.info('url: %s, subsection: %s-%s', all_urls[sub_idx],
                             str(sub_idx).zfill(3), subsection.name)
                page = get_page_contents(str(all_urls[sub_idx]), headers)
                soup = BeautifulSoup(page, "html.parser")

                #div contains all units (seq_contents_#)
                main_content = soup.find("div", {"class": "container"})

                units = crawl_units(main_content)
                counter = 0
                sub_idx = sub_idx + 1

                for unit in units:

                    filename_template = "seq_contents_" + str(
                        counter) + ".html"
                    filename = os.path.join(target_subdir, filename_template)

                    filename_template_txt = "seq_contents_" + str(
                        counter) + ".txt"
                    filename_txt = os.path.join(target_subdir,
                                                filename_template_txt)

                    filename_template_prob_txt = "seq_contents_" + str(
                        counter) + "_prob.txt"
                    filename_prob_txt = os.path.join(
                        target_subdir, filename_template_prob_txt)

                    filename_template_video_json = "seq_contents_" + str(
                        counter) + "_vdo.json"
                    filename_video_json = os.path.join(
                        target_subdir, filename_template_video_json)

                    logging.info('path: %s, filename: %s', target_subdir,
                                 filename)

                    try:
                        file_ = sys.stdout if filename == '-' else codecs.open(
                            filename, 'w', 'utf-8')
                    except IOError as exc:
                        f = open('downloading_error_report.txt', 'a')
                        text = 'External command error ignored: ' + str(
                            exc) + '\n\n'
                        f.write(text)
                        f.close()
                        file_ = sys.stdout if filename == '-' else codecs.open(
                            filename_template, 'w', 'utf-8')

                    file_.writelines(unit.prettify(formatter=None))
                    file_.close()

                    soup = unit.prettify(formatter=None)
                    soup = BeautifulSoup(soup, "html.parser")

                    # select only the html component (disregard video, problem)
                    html_flag = soup.findAll("div",
                                             {"data-block-type": "html"})
                    if len(html_flag) > 0:

                        #create file only when html component exists
                        file_txt = sys.stdout if filename_txt == '-' else codecs.open(
                            filename_txt, 'w', 'utf-8')
                        text = ""
                        for soup_component in html_flag:
                            for s in soup_component.findAll([
                                    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p',
                                    'li'
                            ]):
                                text += s.getText() + " "

                        file_txt.writelines(text)
                        file_txt.close()
                        print(filename_txt + ' of text component was created')

                    # select only the problem component (disregard video, text)
                    prob_txt, prob_types = extract_problem_comp(soup)

                    if len(prob_txt) > 0:
                        file_prob_txt = sys.stdout if filename_prob_txt == '-' else codecs.open(
                            filename_prob_txt, 'w', 'utf-8')
                        for prob_type in prob_types:
                            prob_type_set.append(prob_type + ' \n')

                        file_prob_txt.writelines(prob_txt)
                        file_prob_txt.close()
                        print(filename_prob_txt +
                              ' of problem component was created')

                    tmp_video_dict = extract_video_component(
                        args, coursename, headers, soup,
                        clean_filename(section_dirname),
                        clean_filename(subsection.name),
                        "seq_contents_" + str(counter))
                    if len(tmp_video_dict) > 0:
                        file_video_json = sys.stdout if filename_video_json == '-' else codecs.open(
                            filename_video_json, 'w', 'utf-8')
                        video_unit_dict = dict()
                        for vd in tmp_video_dict:
                            video_unit_dict.update({
                                "video_block_" + str(counter_video).zfill(2):
                                vd
                            })
                            counter_video += 1
                        video_dict2json = json.dumps(video_unit_dict,
                                                     sort_keys=False,
                                                     indent=4,
                                                     separators=(',', ': '))
                        file_video_json.writelines(video_dict2json)
                        file_video_json.close()
                        print(filename_video_json +
                              ' of video component was created')
                    counter += 1

    save_urls_to_file(
        prob_type_set,
        os.path.join(args.html_dir, coursename, "all_prob_type.txt"))
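
save_html_to_file relies on small helpers (mkdir_p, clean_filename) whose definitions are not shown in these examples. A plausible minimal sketch under the usual reading of those names; the project's own versions may differ:

import os
import re


def mkdir_p(path):
    # Create the directory tree if missing, like `mkdir -p`.
    os.makedirs(path, exist_ok=True)


def clean_filename(name, max_length=100):
    # Replace characters that are unsafe in file names and trim the result.
    return re.sub(r'[\\/:*?"<>|\s]+', '_', name.strip())[:max_length]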
Example No. 6
def save_html_to_file(args, selections, all_urls, headers):
    sub_idx = 0
    prob_type_set = []
    counter_video = 1
    counter_unit = 1
    txt_id = 1
    prob_id = 1
    video_id = 1
    comp_id = 1
    tmp_course_strut = dict()
    txt_dict_ls = dict()
    prob_dict_ls = dict()
    comp_dict_ls = dict()
    video_dict_ls = dict()
    for selected_course, selected_sections in selections.items():
        coursename = directory_name(selected_course.name)
        sourcepath = os.path.join(args.html_dir, coursename,
                                  'source_html_file')
        mkdir_p(sourcepath)
        #filename_meta = os.path.join(sourcepath, 'html_metadata.csv')

        metasec_ls = [[], [], [], []]
        for selected_section in selected_sections:
            section_dirname = "%02d-%s" % (selected_section.position,
                                           selected_section.name)
            tmp_course_strut['section'] = (section_dirname)

            for subsection in selected_section.subsections:

                if subsection.name is None:
                    subsection.name = 'Untitled'

                tmp_course_strut['subsection'] = (subsection.name)
                logging.info('url: %s', all_urls[sub_idx])
                page = get_page_contents(str(all_urls[sub_idx]), headers)
                soup = BeautifulSoup(page, "html.parser")

                #div contains all units (seq_contents_#)
                main_content = soup.find("div", {"class": "container"})

                units = crawl_units(main_content)

                sub_idx = sub_idx + 1

                for idx, unit in enumerate(units):

                    filename_template = str(counter_unit).zfill(4) + ".html"
                    filename = os.path.join(args.html_dir, coursename,
                                            'source_html_file',
                                            filename_template)

                    try:
                        file_ = sys.stdout if filename == '-' else codecs.open(
                            filename, 'w', 'utf-8')
                    except IOError as exc:
                        f = open('downloading_error_report.txt', 'a')
                        text = 'External command error ignored: ' + str(
                            exc) + '\n\n'
                        f.write(text)
                        f.close()
                        file_ = sys.stdout if filename == '-' else codecs.open(
                            filename_template, 'w', 'utf-8')

                    file_.writelines(unit.prettify(formatter=None))
                    file_.close()

                    soup = unit.prettify(formatter=None)
                    soup = BeautifulSoup(soup, "html.parser")

                    cur_unit = soup.find("h2", {
                        "class": "hd hd-2 unit-title"
                    }).getText()
                    if cur_unit == None:
                        cur_unit = 'Untitled'
                    tmp_course_strut['unit'] = (cur_unit)

                    logging.info('section: %s', tmp_course_strut['section'])
                    logging.info('     subsection: %s',
                                 tmp_course_strut['subsection'])
                    logging.info('                unit: %s',
                                 tmp_course_strut['unit'])

                    metasec_ls[0].append(tmp_course_strut['section'])
                    metasec_ls[1].append(tmp_course_strut['subsection'])
                    metasec_ls[2].append(tmp_course_strut['unit'])
                    metasec_ls[3].append(filename_template)

                    # select only the html component (disregard video, problem)
                    html_flag = soup.findAll("div",
                                             {"data-block-type": "html"})
                    if len(html_flag) > 0:

                        #create file only when html component exists
                        text = ""
                        for soup_component in html_flag:
                            for s in soup_component.findAll([
                                    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p',
                                    'li'
                            ]):
                                text += s.getText() + " "

                        tmp_dict = {
                            'text_block_' + str(txt_id).zfill(4): {
                                'section': tmp_course_strut['section'],
                                'subsection': tmp_course_strut['subsection'],
                                'unit': tmp_course_strut['unit'],
                                'content': text
                            }
                        }
                        txt_dict_ls.update(tmp_dict)
                        txt_id += 1

                    # select only the problem component (disregard video, text)
                    prob_txt, prob_types = extract_problem_comp(soup)

                    if len(prob_txt) > 0:
                        for prob_type in prob_types:
                            prob_type_set.append(prob_type + ' \n')

                        tmp_dict = {
                            'quiz_block_' + str(prob_id).zfill(4): {
                                'section': tmp_course_strut['section'],
                                'subsection': tmp_course_strut['subsection'],
                                'unit': tmp_course_strut['unit'],
                                'content': prob_txt
                            }
                        }
                        prob_dict_ls.update(tmp_dict)
                        #print(tmp_dict)
                        prob_id += 1

                    tmp_video_dict = extract_video_component(
                        args, coursename, headers, soup,
                        tmp_course_strut['section'],
                        tmp_course_strut['subsection'],
                        tmp_course_strut['unit'])
                    if len(tmp_video_dict) > 0:
                        video_unit_dict = dict()
                        for vd in tmp_video_dict:
                            video_unit_dict.update({
                                "video_block_" + str(counter_video).zfill(4):
                                vd
                            })
                            counter_video += 1

                        video_dict_ls.update(video_unit_dict)
                        video_id += 1

                    logging.debug('video components so far: %s', video_dict_ls)

                    counter_unit += 1

                    set_comp_types = soup.findAll("div",
                                                  {"data-block-type": True})
                    for comp_type in set_comp_types:
                        if comp_type['data-block-type'] in [
                                'html', 'video', 'problem'
                        ]:
                            comp_dict = {
                                str(comp_id).zfill(4) + '_' + comp_type['data-block-type']:
                                {
                                    'section': tmp_course_strut['section'],
                                    'subsection':
                                    tmp_course_strut['subsection'],
                                    'unit': tmp_course_strut['unit'],
                                    'type': comp_type['data-block-type']
                                }
                            }
                            comp_dict_ls.update(comp_dict)
                            comp_id += 1

    txt_dict2json = json.dumps(txt_dict_ls,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
    prob_dict2json = json.dumps(prob_dict_ls,
                                sort_keys=True,
                                indent=4,
                                separators=(',', ': '))
    video_dict2json = json.dumps(video_dict_ls,
                                 sort_keys=True,
                                 indent=4,
                                 separators=(',', ': '))
    comp_dict2json = json.dumps(comp_dict_ls,
                                sort_keys=True,
                                indent=4,
                                separators=(',', ': '))

    with open(os.path.join(args.html_dir, coursename, 'all_textcomp.json'),
              'w',
              encoding='utf-8') as f:
        f.write(txt_dict2json)

    with open(os.path.join(args.html_dir, coursename, 'all_probcomp.json'),
              'w',
              encoding='utf-8') as f:
        f.write(prob_dict2json)

    with open(os.path.join(args.html_dir, coursename, 'all_videocomp.json'),
              'w',
              encoding='utf-8') as f:
        f.write(video_dict2json)

    with open(os.path.join(args.html_dir, coursename, 'all_comp.json'),
              'w',
              encoding='utf-8') as f:
        f.write(comp_dict2json)

    metafile_dict = {
        'section': metasec_ls[0],
        'subsection': metasec_ls[1],
        'unit': metasec_ls[2],
        'htmlfile': metasec_ls[3]
    }
    df = pd.DataFrame.from_dict(metafile_dict)
    df.to_csv(
        os.path.join(args.html_dir, coursename, 'source_html_file',
                     'metadata.csv'))

    save_urls_to_file(
        prob_type_set,
        os.path.join(args.html_dir, coursename, "all_prob_type.txt"))
    make_tarfile(os.path.join(args.html_dir, coursename, 'sourcefile.tar.gz'),
                 os.path.join(args.html_dir, coursename, 'source_html_file'))
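
make_tarfile is external as well; a common stdlib implementation matching the call signature above (output path first, source directory second) would be:

import os
import tarfile


def make_tarfile(output_filename, source_dir):
    # Pack the whole source directory into a gzip-compressed tarball.
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))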
Example No. 7
def extract_video_component(args, coursename, headers, soup, section,
                            subsection, unit):

    video_flag = soup.findAll("div", {"data-block-type": "video"})
    video_meta_list = []
    for video_comp in video_flag:
        video_meta = dict()
        video = video_comp.find('div', {"data-metadata": True})
        txtjson = video['data-metadata']
        edx_video_id = video['id']
        txt2dict = json.loads(txtjson)
        start_time = txt2dict['start']
        yt_id = re.sub(r"1.00:", '', txt2dict['streams'])
        if len(txt2dict['streams']) == 0:
            duration = txt2dict['duration']
            yt_link = 'n/a'
            video_source = list(txt2dict['sources'])
            if duration == 0:
                try:
                    duration = extract_duration_from_non_YT_video(
                        video_source[0], headers)
                except (HTTPError, URLError) as exception:
                    print('     bug: cannot download video from edx site')
                    duration = 'n/a'
            video_meta.update({
                'section': section,
                'subsection': subsection,
                'unit': unit,
                'youtube_url': yt_link,
                'video_source': video_source[0],
                'video_duration': duration,
                'video_id': edx_video_id,
                'start': start_time
            })
        else:
            yt_link = 'https://youtu.be/' + yt_id
            duration = videolen(yt_link)
            video_source = 'n/a'
            if duration == 0:
                duration = txt2dict['duration']
            video_meta.update({
                'section': section,
                'subsection': subsection,
                'unit': unit,
                'youtube_url': yt_link,
                'video_source': video_source,
                'video_duration': duration,
                'video_id': edx_video_id,
                'start': start_time
            })

        for key, value in txt2dict['transcriptLanguages'].items():
            transcript_name = 'transcript_' + key
            transcript_url = BASE_URL + '/' + re.sub(
                r"__lang__", key, txt2dict['transcriptTranslationUrl'])
            if yt_link == 'n/a':
                print('download ' + value + ' transcript of ' +
                      video_source[0])
            else:
                print('download ' + value + ' transcript of ' + yt_link)
            try:
                transcript_dump = get_page_contents(transcript_url, headers)
                transcript_raw = json.loads(transcript_dump)
                #print (transcript_raw)
                speech_period = extract_speech_period(transcript_raw['start'],
                                                      transcript_raw['end'])
                speech_times = extract_speech_times(transcript_raw['start'],
                                                    transcript_raw['end'])

                video_meta.update({
                    transcript_name: transcript_raw['text'],
                    'speech_period': speech_period,
                    'speech_times': speech_times
                })

            except (HTTPError, URLError) as exception:

                print('     bug: cannot download transcript from edx site')
                if yt_link == 'n/a':
                    video_meta.update({
                        transcript_name: {
                            "start": '',
                            "end": '',
                            "text": ''
                        },
                        'speech_period': 'n/a'
                    })
                    logging.warning('transcript (error: %s)', exception)
                    errorlog = os.path.join(args.html_dir, coursename,
                                            'transcript_error_report.txt')
                    text = '---------------------------------\n' \
                        + 'transcript error: ' + str(exception) + '\n' \
                        + 'video file: ' + video_source[0] + '\n' \
                        + 'language: ' + value + '\n' \
                        + 'section:  ' + section + '\n' \
                        + 'subsection: ' + subsection + '\n' \
                        + 'unit_idx: ' + unit + '\n' \
                        + '---------------------------------'
                    with open(errorlog, 'a') as f:
                        f.write(text)
                    continue

                print('     attempting to download the transcript from YouTube')
                transcript_raw = YT_transcript(yt_link, key)
                if len(transcript_raw) == 0:
                    print('     no transcript available on YouTube')
                    video_meta.update({
                        transcript_name: {
                            "start": '',
                            "end": '',
                            "text": ''
                        },
                        'speech_period': 'n/a'
                    })
                    logging.warning('transcript (error: %s)', exception)
                    errorlog = os.path.join(args.html_dir, coursename,
                                            'transcript_error_report.txt')
                    text = '---------------------------------\n' \
                        + 'transcript error: ' + str(exception) + '\n' \
                        + 'video url: ' + yt_link + '\n' \
                        + 'language: ' + value + '\n' \
                        + 'section:  ' + section + '\n' \
                        + 'subsection: ' + subsection + '\n' \
                        + 'unit_idx: ' + unit + '\n' \
                        + '---------------------------------'
                    with open(errorlog, 'a') as f:
                        f.write(text)
                else:
                    print(
                        '     transcript was successfully downloaded from YouTube'
                    )
                    speech_period = extract_speech_period(
                        transcript_raw['start'], transcript_raw['end'])
                    video_meta.update({
                        transcript_name: transcript_raw['text'],
                        'speech_period': speech_period
                    })

        video_meta_list.append(video_meta)
    return video_meta_list
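
extract_speech_period and extract_speech_times are not defined in these examples either. Given that the edX transcript JSON provides parallel start and end timestamp lists, one plausible reading is total speaking time plus per-segment timings; a sketch under that assumption (the real helpers may compute something else):

def extract_speech_period(starts, ends):
    # Assumed: total speaking time, summing (end - start) over all segments.
    return sum(e - s for s, e in zip(starts, ends))


def extract_speech_times(starts, ends):
    # Assumed: (start, duration) pairs per caption segment.
    return [(s, e - s) for s, e in zip(starts, ends)]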