def build_link_map(directory):
    """Build a LinkMap from the HTML files found under *directory*.

    The directory tree is walked recursively and every ``*.html`` file is
    read.  For each file, the ``<script>`` block containing an
    ``mw.config.set(...)`` call is located and the value of its
    ``"wgPageName"`` entry is taken as the page title.  The title is then
    registered with ``LinkMap.add_link`` together with the file's path
    relative to *directory*.

    Files that have no such script block, or whose block has no
    ``"wgPageName"`` entry, are silently skipped.

    Args:
        directory: Root directory to scan for HTML files.

    Returns:
        The populated ``LinkMap`` instance.

    NOTE(review): files are opened in text mode with the platform default
    encoding, as before; confirm whether an explicit encoding is wanted.
    """
    # Compile once: these patterns are applied to every file in the loop.
    # Raw strings avoid the invalid "\." / "\s" escape-sequence warnings.
    config_re = re.compile(
        r'<script>[^<]*mw\.config\.set([^<]*wgPageName[^<]*)</script>')
    whitespace_re = re.compile(r'\s+')
    page_name_re = re.compile(r'"wgPageName":"([^"]*)"')

    # Find all html files.
    html_files = [
        os.path.join(root, filename)
        for root, _dirnames, filenames in os.walk(directory)
        for filename in fnmatch.filter(filenames, '*.html')
    ]

    link_map = LinkMap()
    base_dir = os.path.abspath(directory)

    for fn in html_files:
        # The context manager closes the file even if read() raises.
        with open(fn, "r") as f:
            text = f.read()

        config_match = config_re.search(text)
        if not config_match:
            continue

        # Strip all whitespace so the key/value lookup below does not depend
        # on how the config object was formatted.
        config_text = whitespace_re.sub('', config_match.group(1))

        name_match = page_name_re.search(config_text)
        if not name_match:
            continue

        title = name_match.group(1)
        target = os.path.relpath(os.path.abspath(fn), base_dir)
        link_map.add_link(title, target)

    return link_map