forked from appsembler/msft-courses
-
Notifications
You must be signed in to change notification settings - Fork 0
/
importer.py
199 lines (150 loc) · 5.84 KB
/
importer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import json
from glob import glob
import subprocess
from os import path, walk, mkdir, makedirs, listdir
import fileinput
import yaml
import sys
import shutil
import tarfile
from bs4 import BeautifulSoup
from django.conf import settings
from django.contrib.auth.models import User
from xmodule.modulestore import ModuleStoreEnum
from django_comment_common.utils import (
seed_permissions_roles,
are_permissions_roles_seeded,
)
from opaque_keys.edx.keys import CourseKey
from opaque_keys.edx.locator import LibraryLocator
from xmodule.contentstore.django import contentstore
from xmodule.modulestore.django import modulestore
from xmodule.modulestore.xml_importer import (
import_course_from_xml,
import_library_from_xml
)
# Module store handle, passed as `store=` to the course and library XML importers below.
MOD_STORE = modulestore()
# Passed as `data_dir=` to the XML importers.
DATA_DIR = '/edx/var/edxapp/data' # settings.GITHUB_REPO_ROOT
# Scratch directory; `cleanup()` deletes and recreates it.
WORK_TMP_DIR = '/tmp/courses-workdir'
# Destination directory handed to `unzip -d` in `extract_zip_courses()`.
ZIP_EXTRACT_DIR = path.join(WORK_TMP_DIR, 'zip_dest')
# Parent of the per-course / per-library directories that archives are untarred into.
XML_EXTRACT_DIR = path.join(WORK_TMP_DIR, 'xml_root')
def _get_courses_dir():
"""
This is set via `import.sh` at run time.
"""
return COURSES_DIR
def _read_file_in_tgz(filename, sub_filename):
with tarfile.open(filename, 'r:gz') as tgz:
file_obj = tgz.extractfile(sub_filename)
if file_obj:
return file_obj.read()
def _is_library_file(filename):
    """
    Tell whether the archive `filename` is a library export.

    An archive counts as a library when it contains a non-empty
    `library/library.xml` member. The `KeyError` raised for a missing member
    is treated as "not a library".
    """
    try:
        library_xml = _read_file_in_tgz(filename, 'library/library.xml')
    except KeyError:
        return False
    return bool(library_xml)
def _filename_to_id_and_run(filename):
basename = path.basename(filename).replace('.tar.gz', '')
parts = basename.split('-')
course_id, run = parts[-2:] # Microsoft course file naming convention
return course_id.strip(), run.strip()
def cleanup():
    """
    Reset the scratch area.

    Removes `WORK_TMP_DIR` (and everything in it) if it exists, then creates
    it again together with the XML and zip extraction directories.
    """
    if path.exists(WORK_TMP_DIR):
        shutil.rmtree(WORK_TMP_DIR)

    # The work dir comes first: the two extraction dirs are built from it.
    for fresh_dir in (WORK_TMP_DIR, XML_EXTRACT_DIR, ZIP_EXTRACT_DIR):
        mkdir(fresh_dir)
def extract_zip_courses():
    """
    Unpack every `.zip` found anywhere under the courses directory into
    `ZIP_EXTRACT_DIR` by shelling out to `unzip`.

    NOTE(review): the exit status of `unzip` is not checked.
    """
    for dirpath, _subdirs, filenames in walk(_get_courses_dir()):
        zip_names = [name for name in filenames if name.endswith('.zip')]
        for zip_name in zip_names:
            zip_path = path.join(dirpath, zip_name)
            subprocess.call(['unzip', zip_path, '-d', ZIP_EXTRACT_DIR])
def get_importable_files(get_courses=False):
    """
    Yield the paths of the `.tar.gz` archives of the requested kind.

    `ZIP_EXTRACT_DIR` and the courses directory are walked, in that order.
    An archive is yielded when `get_courses ^ _is_library_file(archive)`:

        get_courses   is_library   yielded
        0             0            0
        0             1            1
        1             0            1
        1             1            0

    :param get_courses: true to get course archives, false to get library archives.
    :return: generator over archive paths.
    """
    search_roots = [ZIP_EXTRACT_DIR, _get_courses_dir()]
    for root in search_roots:
        for dirpath, _subdirs, filenames in walk(root):
            for name in filenames:
                if not name.endswith('.tar.gz'):
                    continue
                archive = path.join(dirpath, name)
                if get_courses ^ _is_library_file(archive):
                    yield archive
def _fix_library_source_bug(course_xml_dir):
libraries_dir = path.join(course_xml_dir, 'course/library_content/')
if not path.exists(libraries_dir):
return
for library_file in listdir(libraries_dir):
library_file = path.join(libraries_dir, library_file)
with open(library_file, 'r') as library_f:
lib_xml = BeautifulSoup(library_f.read(), 'lxml')
lib_element = lib_xml.library_content
with open(library_file, 'w') as library_f:
del lib_element['source_library_version']
library_f.write(str(lib_element))
def import_single_course(filename):
    """
    Import one course archive into the module store.

    The archive is untarred into its own `<id>-<run>` directory under
    `XML_EXTRACT_DIR`, its `library_content` files are patched, and the
    result is imported as `course-v1:Microsoft+<id>+<run>`. Forum roles are
    then seeded for every imported course that does not have them yet.

    NOTE(review): the exit status of `tar` is not checked.

    :param filename: path of the course `.tar.gz` archive.
    """
    print >> sys.stderr, 'IMPORTING course:', filename

    course_id, course_run = _filename_to_id_and_run(filename)
    extract_name = '{id}-{run}'.format(id=course_id, run=course_run)
    course_xml_dir = path.join(XML_EXTRACT_DIR, extract_name)
    course_full_id = 'course-v1:Microsoft+{id}+{run}'.format(
        id=course_id,
        run=course_run
    )

    mkdir(course_xml_dir)
    subprocess.call(['tar', '-xzf', filename, '-C', course_xml_dir])
    _fix_library_source_bug(course_xml_dir)

    print >> sys.stderr, 'IMPORTING course:', course_full_id
    imported_courses = import_course_from_xml(
        store=MOD_STORE,
        user_id=ModuleStoreEnum.UserID.mgmt_command,
        data_dir=DATA_DIR,
        source_dirs=[path.join(course_xml_dir, 'course')],  # Open edX needs `course` dir
        load_error_modules=False,
        static_content_store=contentstore(),
        verbose=True,
        do_import_static=True,
        target_id=CourseKey.from_string(course_full_id),
        create_if_not_present=True,
    )

    for imported_course in imported_courses:
        course_key = imported_course.id
        if are_permissions_roles_seeded(course_key):
            continue
        print >> sys.stderr, 'Seeding forum roles for course', course_key
        seed_permissions_roles(course_key)
def import_single_library(filename):
    """
    Import one library archive into the module store.

    The archive is untarred into its own directory under `XML_EXTRACT_DIR`
    (named after the archive without `.tar.gz`). The target `LibraryLocator`
    is built from the `org` and `library` attributes of the `<library>`
    element in `library/library.xml`.

    NOTE(review): the exit status of `tar` is not checked.

    :param filename: path of the library `.tar.gz` archive.
    :raises ValueError: if `library/library.xml` has no `<library>` element.
    """
    print >> sys.stderr, 'IMPORTING library:', filename
    no_extension = path.basename(filename).replace('.tar.gz', '')
    library_xml_dir = path.join(XML_EXTRACT_DIR, no_extension)
    mkdir(library_xml_dir)
    subprocess.call(['tar', '-xzf', filename, '-C', library_xml_dir])

    library_xml_path = path.join(library_xml_dir, 'library/library.xml')
    with open(library_xml_path) as lib_xml_file:
        # Name the parser explicitly, as `_fix_library_source_bug` does.
        # Without it bs4 picks whichever parser is installed and warns.
        lib_xml = BeautifulSoup(lib_xml_file.read(), 'lxml')

    lib_element = lib_xml.find('library')
    if lib_element is None:
        raise ValueError(
            'No <library> element found in {0!r}'.format(library_xml_path)
        )

    target_id = LibraryLocator(org=str(lib_element['org']), library=str(lib_element['library']))
    print >> sys.stderr, 'IMPORTING library:', target_id
    import_library_from_xml(
        store=MOD_STORE,
        user_id=ModuleStoreEnum.UserID.mgmt_command,
        data_dir=DATA_DIR,
        source_dirs=[path.join(library_xml_dir, 'library')],  # Open edX needs `library` dir
        load_error_modules=False,
        static_content_store=contentstore(),
        verbose=True,
        do_import_static=True,
        target_id=target_id,
        create_if_not_present=True,
    )
def run():
    """
    Run the whole import: reset the work directory, unpack the zip bundles,
    then import every library archive followed by every course archive.
    """
    cleanup()
    extract_zip_courses()

    # (get_courses flag, importer) pairs, in execution order: libraries first.
    import_passes = (
        (False, import_single_library),
        (True, import_single_course),
    )
    for want_courses, import_one in import_passes:
        for archive in get_importable_files(get_courses=want_courses):
            import_one(archive)
# Kick off the import as soon as this file is executed; there is no `__main__` guard.
run()