def createHTML(self, parent, token, page):
    headings = self.extension.binContent(page, token['location'], ContentExtension.FOLDER)
    links = self.extension.get('source_links')

    # Build lists
    for head in sorted(headings.keys()):
        items = headings[head]
        if head:
            h = html.Tag(parent, 'h{:d}'.format(int(token['level'])), class_='moose-a-to-z')
            if head in links:
                p = self.translator.findPage(links[head])
                dest = p.relativeDestination(page)
                html.Tag(h, 'a', href=dest, string=str(head) + ' ')
            else:
                html.String(h, content=str(head))
        row = html.Tag(parent, 'div', class_='row')
        for chunk in mooseutils.make_chunks(list(items), 3):
            col = html.Tag(row, 'div', class_='col s12 m6 l4')
            ul = html.Tag(col, 'ul', class_='moose-a-to-z')
            for text, path, _ in chunk:
                li = html.Tag(ul, 'li')
                html.Tag(li, 'a', href=path, string=str(text.replace('.md', '')))
def _run(self, nodes, container, target, num_threads=1):
    """Helper function for running in parallel using Pipe"""

    # Create connection objects representing the receiver and sender ends of the pipe.
    receivers = []
    random.shuffle(nodes)
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        r, s = self._ctx.Pipe(False)
        receivers.append(r)
        self._ctx.Process(target=target, args=(chunk, s)).start()
        s.close()  # need to close this instance because a copy was sent to the Process() object

    # Iterate through the list of ready connection objects, i.e., those that either have data to
    # receive or their corresponding sender connection has been closed, until all are removed
    # from the list of pending connections. If there is no data to receive and the sender has
    # been closed, then an EOFError is raised indicating that the receiver can be removed.
    while receivers:
        for r in [r for r in receivers if r.poll() or r.closed]:
            try:
                data = r.recv()
            except EOFError:
                receivers.remove(r)
            else:
                for uid, attributes, out in data:
                    self._getPage(uid).attributes.update(attributes)
                    if container is not None:
                        container[uid] = out
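# A note on the idiom above: the parent closes its copy of the send end immediately after
# Process() starts, so that once a worker exits, recv() on the corresponding receiver raises
# EOFError and the connection can be retired. A minimal, self-contained sketch of this
# fan-out/EOF pattern (hypothetical worker and data, not the original source):
import multiprocessing

def _square_worker(items, conn):
    """Send one result per item, then exit; the child's pipe ends close with the process."""
    for i in items:
        conn.send(i * i)
    conn.close()

if __name__ == '__main__':
    receivers = []
    for chunk in ([1, 2], [3, 4]):
        r, s = multiprocessing.Pipe(False)
        receivers.append(r)
        multiprocessing.Process(target=_square_worker, args=(chunk, s)).start()
        s.close()  # drop the parent's send handle, or EOFError would never be raised
    while receivers:
        for r in [r for r in receivers if r.poll(0.1)]:
            try:
                print(r.recv())
            except EOFError:
                receivers.remove(r)  # worker exited and all of its data has been drained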
def build(self, num_threads=multiprocessing.cpu_count()):
    """
    Build all the pages in parallel.
    """
    if self._root is None:
        raise mooseutils.MooseException("The 'init' method must be called prior to build.")

    # Build the complete markdown file tree from the configuration supplied at construction
    if not isinstance(self._root, anytree.NodeMixin):
        raise TypeError("The 'buildNodes' method must return an anytree.NodeMixin object.")

    # Build the pages
    pages = list(self)
    jobs = []
    for chunk in mooseutils.make_chunks(pages, num_threads):
        p = multiprocessing.Process(target=self.buildPages, args=(chunk,))
        p.start()
        jobs.append(p)
    for job in jobs:
        job.join()

    self.copyFiles()
def _run(self, nodes, container, target, num_threads=1, prefix='Running'):
    """Helper function for running in parallel using Pipe"""

    # Time the process
    t = time.time()
    LOG.info('%s using %s threads...', prefix, num_threads)

    # Start the workers, all sharing the send end of a single pipe
    jobs = []
    conn1, conn2 = self._ctx.Pipe(False)
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        p = self._ctx.Process(target=target, args=(chunk, conn2))
        p.start()
        jobs.append(p)

    # Drain the receive end while any worker is still alive
    while any(job.is_alive() for job in jobs):
        if conn1.poll():
            for uid, attributes, out in conn1.recv():
                for node in nodes:
                    if uid == node.uid:
                        node.attributes.update(attributes)
                        break
                if container is not None:
                    container[uid] = out

    LOG.info('Finished %s [%s sec.]', prefix, time.time() - t)
def execute(self, nodes, num_threads=1, read=True, tokenize=True, render=True, write=True):
    if read and not self._page_content:
        self._page_content = self._manager.dict({p.uid: None for p in self._page_objects})
    if tokenize and not self._page_ast:
        self._page_ast = self._manager.dict({p.uid: None for p in self._page_objects})
    if render and not self._page_result:
        self._page_result = self._manager.dict({p.uid: None for p in self._page_objects})

    # Initialize a manager object dictionary with the current attributes of Page objects
    page_attributes = self._manager.dict({p.uid: p.attributes for p in self._page_objects})

    # Distribute nodes to the processes and run the execute methods on each chunk
    jobs = []
    random.shuffle(nodes)
    args = (self._ctx.Barrier(num_threads), page_attributes, read, tokenize, render, write)
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        p = self._ctx.Process(target=self._target, args=(chunk, *args))
        jobs.append(p)
        p.start()

    for job in jobs:
        job.join()

    # This is needed to maintain the page attributes during live serving. In parallel, when the
    # Executioner executes, each process created above gets a copy of self._page_objects. Each
    # process runs the _target method and keeps the attributes of the pages up to date across
    # the processes. This call updates the attributes of the original pages that were copied
    # when the processes started. Thus, when new processes are started during a live reload,
    # the attributes are correct when the copy is performed again for the new processes.
    self._updateAttributes(page_attributes)
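# A condensed sketch of the Manager/Barrier pattern used above, with a hypothetical _target and
# page data (the real _target reads/tokenizes/renders/writes; here each worker just publishes
# an attribute update, then waits at the barrier so every worker observes a complete dict
# before continuing):
import multiprocessing

def _target_sketch(chunk, barrier, shared):
    for uid in chunk:
        shared[uid] = {'built': True}  # publish this worker's attribute updates
    barrier.wait()                     # synchronize before any cross-page reads
    # ... a later stage could now safely read every entry in 'shared' ...

if __name__ == '__main__':
    num_threads = 2
    manager = multiprocessing.Manager()
    shared = manager.dict({uid: {} for uid in range(4)})
    barrier = multiprocessing.Barrier(num_threads)
    jobs = []
    for chunk in ([0, 1], [2, 3]):
        p = multiprocessing.Process(target=_target_sketch, args=(chunk, barrier, shared))
        jobs.append(p)
        p.start()
    for job in jobs:
        job.join()
    print(dict(shared))  # the parent then folds these back, as _updateAttributes() does above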
def execute(self, nodes, num_threads=1):
    """Perform the translation with multiprocessing."""
    barrier = multiprocessing.Barrier(num_threads)
    manager = multiprocessing.Manager()
    page_attributes = manager.list([None] * len(self._page_objects))

    # Initialize the page attributes container using the existing list of Page node objects
    for i in range(len(page_attributes)):
        Executioner.setMutable(self._page_objects[i], True)
        page_attributes[i] = self._page_objects[i].attributes
        Executioner.setMutable(self._page_objects[i], False)

    jobs = []
    random.shuffle(nodes)
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        p = multiprocessing.Process(target=self._target, args=(chunk, barrier, page_attributes))
        jobs.append(p)
        p.start()

    for job in jobs:
        job.join()

    # This is needed to maintain the page attributes during live serving. In parallel, when the
    # Executioner executes, each process created above gets a copy of self._page_objects. Each
    # process runs the _target method and keeps the attributes of the pages up to date across
    # the processes. This call updates the attributes of the original pages that were copied
    # when the processes started. Thus, when new processes are started during a live reload,
    # the attributes are correct when the copy is performed again for the new processes.
    self._updateAttributes(page_attributes)
def _run(self, nodes, container, target, num_threads=1, prefix='Running'):
    """Helper function for running in parallel using Pipe"""

    # Time the process
    t = time.time()
    LOG.info('%s using %s threads...', prefix, num_threads)

    # Start the workers, all sharing the send end of a single pipe
    jobs = []
    conn1, conn2 = multiprocessing.Pipe(False)
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        p = multiprocessing.Process(target=target, args=(chunk, conn2))
        p.start()
        jobs.append(p)

    # Drain the receive end while any worker is still alive
    while any(job.is_alive() for job in jobs):
        if conn1.poll():
            data = conn1.recv()
            for uid, attributes, out in data:
                node = self._page_objects[uid]
                Executioner.setMutable(node, True)
                node.attributes.update(attributes)
                Executioner.setMutable(node, False)
                if container is not None:
                    container[uid] = out

    LOG.info('Finished %s [%s sec.]', prefix, time.time() - t)
def createMaterialize(self, parent, token, page):

    # Initialize alphabetized storage
    headings = dict()
    for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        headings[letter] = dict()

    # Extract headings, default to filename if a heading is not found
    func = lambda n: n.local.startswith(token['location']) and isinstance(n, pages.Source)
    for node in self.translator.findPages(func):
        h_node = heading.find_heading(self.translator, node)
        if h_node is not None:
            r = html.Tag(None, 'span')
            self.renderer.render(r, h_node, page)
            key = r.text()
        else:
            r = None
            key = node.name
        letter = key[0].upper()
        headings[letter][key] = node.relativeDestination(page)

    # Buttons
    buttons = html.Tag(parent, 'div', class_='moose-a-to-z-buttons')
    if not token['buttons']:
        buttons.parent = None

    # Build lists
    for letter, items in headings.iteritems():
        id_ = uuid.uuid4()
        btn = html.Tag(buttons, 'a', string=unicode(letter),
                       class_='btn moose-a-to-z-button', href='#{}'.format(id_))
        if not items:
            btn.addClass('disabled')
            continue

        html.Tag(parent, 'h{}'.format(token['level']), class_='moose-a-to-z',
                 id_=unicode(id_), string=unicode(letter))
        row = html.Tag(parent, 'div', class_='row')
        links = [(text, href) for text, href in items.iteritems()]
        for chunk in mooseutils.make_chunks(links, 3):
            col = html.Tag(row, 'div', class_='col s12 m6 l4')
            ul = html.Tag(col, 'ul', class_='moose-a-to-z')
            for text, href in chunk:
                li = html.Tag(ul, 'li')
                html.Tag(li, 'a', href=href, string=unicode(text))
def execute(self, nodes, num_threads=1):
    """Perform parallel conversion using multiprocessing Pipe."""
    if num_threads > len(nodes):
        num_threads = len(nodes)

    # Tokenization
    jobs = []
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        conn1, conn2 = multiprocessing.Pipe(False)
        p = multiprocessing.Process(target=self.__tokenize_target, args=(chunk, conn2))
        p.start()
        jobs.append((p, conn1, conn2))

    # Finish the jobs and collect data from the Pipe
    while any(job[0].is_alive() for job in jobs):
        for job, conn1, conn2 in jobs:
            if conn1.poll():
                uid = conn1.recv()
                if uid == ParallelPipe.PROCESS_FINISHED:
                    conn1.close()
                    job.join()
                    continue
                self._tree_data[uid] = conn1.recv()
                self._meta_data[uid] = conn1.recv()

    self._ast_available = True

    # Rendering
    jobs = []
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        p = multiprocessing.Process(target=self.__render_target, args=(chunk,))
        p.start()
        jobs.append(p)
    for job in jobs:
        job.join()
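# The tokenization loop above relies on a sentinel message (ParallelPipe.PROCESS_FINISHED) to
# detect worker completion instead of EOF. A minimal sketch of that protocol, with a
# hypothetical sentinel and payload (the real workers send a uid, then tree data, then meta
# data):
import multiprocessing

FINISHED = 'FINISHED'  # hypothetical sentinel value

def _producer(items, conn):
    for uid, payload in items:
        conn.send(uid)      # announce which item follows
        conn.send(payload)  # then send its payload
    conn.send(FINISHED)     # sentinel: this worker has no more data

if __name__ == '__main__':
    results = {}
    recv_end, send_end = multiprocessing.Pipe(False)
    p = multiprocessing.Process(target=_producer, args=([(0, 'ast-a'), (1, 'ast-b')], send_end))
    p.start()
    while True:
        uid = recv_end.recv()
        if uid == FINISHED:
            break
        results[uid] = recv_end.recv()
    p.join()
    print(results)  # {0: 'ast-a', 1: 'ast-b'}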
def execute(self, nodes, num_threads=1):
    """Perform the translation with multiprocessing."""
    if num_threads > len(nodes):
        num_threads = len(nodes)

    jobs = []
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        p = multiprocessing.Process(target=self.__target, args=(chunk,))
        p.start()
        jobs.append(p)
    for job in jobs:
        job.join()
def execute(self, nodes, num_threads=1):
    """Perform the translation with multiprocessing."""
    barrier = multiprocessing.Barrier(num_threads)
    manager = multiprocessing.Manager()
    page_attributes = manager.list([None] * len(self._page_objects))

    jobs = []
    random.shuffle(nodes)
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        p = multiprocessing.Process(target=self._target, args=(chunk, barrier, page_attributes))
        jobs.append(p)
        p.start()

    for job in jobs:
        job.join()
def execute(self, nodes, num_threads=1, read=True, tokenize=True, render=True, write=True):
    page_attributes = self._manager.dict({p.uid: p.attributes for p in nodes})

    # Distribute nodes to the processes and run the _target() method on each chunk.
    jobs = []
    random.shuffle(nodes)
    args = (self._ctx.Barrier(num_threads), page_attributes, read, tokenize, render, write)
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        p = self._ctx.Process(target=self._target, args=(chunk, *args))
        jobs.append(p)
        p.start()

    for job in jobs:
        job.join()

    # The original copy of the 'self._page_objects' container needs to be updated to ensure that
    # the class instance retains this information over successive invocations of this method.
    self._updateAttributes(page_attributes)
def createMaterialize(self, parent, token, page):
    headings = self.extension.binContent(page, token['location'], ContentExtension.FOLDER)

    # Build lists
    for head in sorted(headings.keys()):
        items = headings[head]
        if head:
            html.Tag(parent, 'h{:d}'.format(int(token['level'])), class_='moose-a-to-z',
                     string=unicode(head))
        row = html.Tag(parent, 'div', class_='row')
        for chunk in mooseutils.make_chunks(list(items), 3):
            col = html.Tag(row, 'div', class_='col s12 m6 l4')
            ul = html.Tag(col, 'ul', class_='moose-a-to-z')
            for text, path, _ in chunk:
                li = html.Tag(ul, 'li')
                html.Tag(li, 'a', href=path, string=unicode(text.replace('.md', '')))
def execute(self, nodes, num_threads=1):
    """Perform the translation with multiprocessing."""
    if num_threads > len(nodes):
        num_threads = len(nodes)

    if sys.version_info[0] == 2:
        barrier = mooseutils.parallel.Barrier(num_threads)
    else:
        barrier = multiprocessing.Barrier(num_threads)

    jobs = []
    for chunk in mooseutils.make_chunks(nodes, num_threads):
        p = multiprocessing.Process(target=self.__target, args=(chunk, barrier))
        jobs.append(p)
        p.start()

    for job in jobs:
        job.join()
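# Python 2's multiprocessing module has no Barrier, which is why the function above falls back
# to mooseutils.parallel.Barrier. For illustration only, a single-use barrier in the same
# spirit can be assembled from a Condition and a shared counter (a sketch, not the actual
# mooseutils implementation):
import multiprocessing

class SimpleBarrier(object):
    """Single-use barrier: wait() blocks until n processes have arrived."""
    def __init__(self, n):
        self.n = n
        self.count = multiprocessing.Value('i', 0)
        self.cond = multiprocessing.Condition()

    def wait(self):
        with self.cond:
            self.count.value += 1
            if self.count.value == self.n:
                self.cond.notify_all()          # last arrival releases everyone
            else:
                while self.count.value < self.n:
                    self.cond.wait()            # predicate loop guards spurious wakeups

def _demo(i, barrier):
    print('phase one', i)
    barrier.wait()  # no process enters phase two until all have finished phase one
    print('phase two', i)

if __name__ == '__main__':
    b = SimpleBarrier(3)
    for i in range(3):
        multiprocessing.Process(target=_demo, args=(i, b)).start()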
def createHTMLHelper(self, parent, token, page):

    # Initialize alphabetized storage
    headings = self.extension.binContent(page, token['location'], ContentExtension.LETTER)
    for letter in '0123456789abcdefghijklmnopqrstuvwxyz':
        if letter not in headings:
            headings[letter] = set()

    # Buttons
    buttons = html.Tag(parent, 'div', class_='moose-a-to-z-buttons')
    if not token['buttons']:
        buttons.parent = None

    # Build lists
    for letter in sorted(headings.keys()):
        items = headings[letter]
        id_ = uuid.uuid4()
        btn = html.Tag(buttons, 'a', string=str(letter.upper()),
                       class_='btn moose-a-to-z-button', href='#{}'.format(id_))
        if not items:
            btn.addClass('disabled')
            continue

        html.Tag(parent, 'h{:d}'.format(int(token['level'])), class_='moose-a-to-z',
                 id_=str(id_), string=str(letter))
        row = html.Tag(parent, 'div', class_='row')
        for chunk in mooseutils.make_chunks(list(items), 3):
            col = html.Tag(row, 'div', class_='col s12 m6 l4')
            ul = html.Tag(col, 'ul', class_='moose-a-to-z')
            for text, path, _ in chunk:
                li = html.Tag(ul, 'li')
                html.Tag(li, 'a', href=path, string=str(text))
def createMaterialize(self, parent, token, page):
    location = token['location']
    func = lambda p: p.local.startswith(location) and isinstance(p, pages.Source)
    nodes = self.translator.findPages(func)
    nodes.sort(key=lambda n: n.local)

    headings = collections.defaultdict(list)
    for node in nodes:
        key = tuple(node.local.replace(location, '').strip(os.sep).split(os.sep))
        head = key[0] if len(key) > 1 else u''
        headings[head].append((node.name, node.relativeDestination(page)))

    headings = [(h, items) for h, items in headings.iteritems()]
    headings.sort(key=lambda h: h[0])

    # Build lists
    for head, items in headings:
        if head:
            html.Tag(parent, 'h{}'.format(token['level']), class_='moose-a-to-z',
                     string=unicode(head))
        row = html.Tag(parent, 'div', class_='row')
        for chunk in mooseutils.make_chunks(items, 3):
            col = html.Tag(row, 'div', class_='col s12 m6 l4')
            ul = html.Tag(col, 'ul', class_='moose-a-to-z')
            for text, href in chunk:
                li = html.Tag(ul, 'li')
                html.Tag(li, 'a', href=href, string=unicode(text.replace('.md', '')))
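# The key computation above reduces a page's path, relative to 'location', to its first
# directory component (or an empty string for top-level files). A quick standalone check of
# that expression with hypothetical paths:
import os

location = 'docs/content'
for local in ('docs/content/systems/kernels/Diffusion.md', 'docs/content/index.md'):
    key = tuple(local.replace(location, '').strip(os.sep).split(os.sep))
    head = key[0] if len(key) > 1 else ''
    print(local, '->', repr(head))
# -> 'systems' for the nested page, '' for the top-level page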
def createMaterialize(self, parent, token, page):

    # Initialize alphabetized storage
    headings = self.extension.binContent(page, token['location'], ContentExtension.LETTER)
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        if letter not in headings:
            headings[letter] = set()

    # Buttons
    buttons = html.Tag(parent, 'div', class_='moose-a-to-z-buttons')
    if not token['buttons']:
        buttons.parent = None

    # Build lists
    for letter, items in headings.iteritems():
        id_ = uuid.uuid4()
        btn = html.Tag(buttons, 'a', string=unicode(letter.upper()),
                       class_='btn moose-a-to-z-button', href='#{}'.format(id_))
        if not items:
            btn.addClass('disabled')
            continue

        html.Tag(parent, 'h{:d}'.format(int(token['level'])), class_='moose-a-to-z',
                 id_=unicode(id_), string=unicode(letter))
        row = html.Tag(parent, 'div', class_='row')
        for chunk in mooseutils.make_chunks(list(items), 3):
            col = html.Tag(row, 'div', class_='col s12 m6 l4')
            ul = html.Tag(col, 'ul', class_='moose-a-to-z')
            for text, path, _ in chunk:
                li = html.Tag(ul, 'li')
                html.Tag(li, 'a', href=path, string=unicode(text))
def createMaterialize(self, parent, token, page):
    headings = self.extension.binContent(page, token['location'], ContentExtension.FOLDER)

    # Build lists
    for head, items in headings.iteritems():
        if head:
            html.Tag(parent, 'h{:d}'.format(int(token['level'])), class_='moose-a-to-z',
                     string=unicode(head))
        row = html.Tag(parent, 'div', class_='row')
        for chunk in mooseutils.make_chunks(list(items), 3):
            col = html.Tag(row, 'div', class_='col s12 m6 l4')
            ul = html.Tag(col, 'ul', class_='moose-a-to-z')
            for text, path, _ in chunk:
                li = html.Tag(ul, 'li')
                html.Tag(li, 'a', href=path, string=unicode(text.replace('.md', '')))
def execute(self, num_threads=1):
    """
    Perform parallel build for all pages.

    Inputs:
        num_threads[int]: The number of threads to use (default: 1).

    NOTICE:
    A proper parallelization for MooseDocs would be four parallel steps, with minimal
    communication.
        1. Read all the markdown files (in parallel).
        2. Perform the AST tokenization (in parallel), then communicate the completed AST
           back to the main process.
        3. Convert the AST to HTML (in parallel).
        4. Write/copy (in parallel) the completed HTML and other files (images, js, etc.).

    However, step two is problematic because python requires that the AST be pickled
    (which is possible) for communication. In doing this I realized that the pickling was
    a limiting factor and made the AST step very slow. I need to investigate this further
    to make sure I was using a non-locking pool of workers, but this was taking too much
    development time.

    The current implementation performs all four steps together, which generally works
    just fine, with one exception. The autolink extension actually interrogates the AST
    from other pages. Hence, if the other page was generated off process the information
    is not available. The current implementation will just compute the AST locally (i.e.,
    I am performing repeated calculations in favor of communication). This works well
    enough for now, but as more autolinking is performed and other similar extensions are
    created this could cause a slow down. Long term this should be looked into again; for
    now the current approach is working well.

    This new system is already roughly four times faster than the previous implementation
    and likely could be optimized further. The multiprocessing.Manager() needs to be
    explored; it is working to pull the JSON index information together.
    """
    common.check_type('num_threads', num_threads, int)
    self.__assertInitialize()

    # Log start message and time
    LOG.info("Building Pages...")
    start = time.time()

    manager = multiprocessing.Manager()
    array = manager.list()

    def target(nodes, lock):
        """Helper for building multiple nodes (i.e., a chunk for a process)."""
        for node in nodes:
            node.build()
            if isinstance(node, page.MarkdownNode):
                node.buildIndex(self.renderer.get('home', None))
                with lock:
                    for entry in node.index:
                        array.append(entry)

    # Complete list of nodes
    nodes = [n for n in anytree.PreOrderIter(self.root)]

    # Serial
    if num_threads == 1:
        target(nodes, self.lock)

    # Multiprocessing
    else:
        jobs = []
        for chunk in mooseutils.make_chunks(nodes, num_threads):
            p = multiprocessing.Process(target=target, args=(chunk, self.lock))
            p.start()
            jobs.append(p)
        for job in jobs:
            job.join()

    # Done
    stop = time.time()
    LOG.info("Build time %s sec.", stop - start)

    iname = os.path.join(self.destination, 'js', 'search_index.js')
    if not os.path.isdir(os.path.dirname(iname)):
        os.makedirs(os.path.dirname(iname))
    items = [v for v in array if v]
    common.write(iname, 'var index_data = {};'.format(json.dumps(items)))
def assertChunk(self, n, gold):
    out = list(mooseutils.make_chunks(self.data, n))
    self.assertEqual(out, gold)
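# Nearly every snippet in this section leans on mooseutils.make_chunks to divide work among
# processes (or lists among columns). Its implementation is not shown here; its use with
# num_threads suggests it yields n contiguous, nearly equal sublists. A pure-Python stand-in
# with that assumed behavior:
def make_chunks_sketch(seq, n):
    """Yield n contiguous chunks of seq, as evenly sized as possible (assumed semantics)."""
    q, r = divmod(len(seq), n)
    start = 0
    for i in range(n):
        size = q + (1 if i < r else 0)  # spread the remainder over the leading chunks
        yield seq[start:start + size]
        start += size

# Example: list(make_chunks_sketch(list(range(7)), 3)) == [[0, 1, 2], [3, 4], [5, 6]]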
def execute(self, num_threads=1):
    """
    Perform parallel build for all pages.

    Inputs:
        num_threads[int]: The number of threads to use (default: 1).

    NOTICE:
    A proper parallelization for MooseDocs would be four parallel steps, with minimal
    communication.
        1. Read all the markdown files (in parallel).
        2. Perform the AST tokenization (in parallel), then communicate the completed AST
           back to the main process.
        3. Convert the AST to HTML (in parallel).
        4. Write/copy (in parallel) the completed HTML and other files (images, js, etc.).

    However, step two is problematic because python requires that the AST be pickled
    (which is possible) for communication. In doing this I realized that the pickling was
    a limiting factor and made the AST step very slow. I need to investigate this further
    to make sure I was using a non-locking pool of workers, but this was taking too much
    development time.

    The current implementation performs all four steps together, which generally works
    just fine, with one exception. The autolink extension actually interrogates the AST
    from other pages. Hence, if the other page was generated off process the information
    is not available. The current implementation will just compute the AST locally (i.e.,
    I am performing repeated calculations in favor of communication). This works well
    enough for now, but as more autolinking is performed and other similar extensions are
    created this could cause a slow down. Long term this should be looked into again; for
    now the current approach is working well.

    This new system is already roughly four times faster than the previous implementation
    and likely could be optimized further. The multiprocessing.Manager() needs to be
    explored; it is working to pull the JSON index information together.
    """
    common.check_type('num_threads', num_threads, int)
    self.__assertInitialize()
    self.renderer.preExecute()

    # Log start message and time
    LOG.info("Building Pages...")
    start = time.time()

    manager = multiprocessing.Manager()
    array = manager.list()
    build_index = isinstance(self.renderer, MaterializeRenderer)

    def target(nodes, lock):
        """Helper for building multiple nodes (i.e., a chunk for a process)."""
        for node in nodes:
            node.build()
            if isinstance(node, page.MarkdownNode):
                if build_index:
                    node.buildIndex(self.renderer.get('home', None))
                    with lock:
                        for entry in node.index:
                            array.append(entry)

    # Complete list of nodes
    nodes = [n for n in anytree.PreOrderIter(self.root)]

    # Serial
    if num_threads == 1:
        target(nodes, self.lock)

    # Multiprocessing
    else:
        jobs = []
        for chunk in mooseutils.make_chunks(nodes, num_threads):
            p = multiprocessing.Process(target=target, args=(chunk, self.lock))
            p.start()
            jobs.append(p)
        for job in jobs:
            job.join()

    # Done
    stop = time.time()
    LOG.info("Build time %s sec.", stop - start)

    if build_index:
        iname = os.path.join(self.get('destination'), 'js', 'search_index.js')
        if not os.path.isdir(os.path.dirname(iname)):
            os.makedirs(os.path.dirname(iname))
        items = [v for v in array if v]
        common.write(iname, 'var index_data = {};'.format(json.dumps(items)))

    self.renderer.postExecute()
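# The search-index aggregation above shares a Manager().list() proxy across processes, with a
# Lock keeping each worker's batch of appends contiguous. A self-contained sketch of that
# pattern (hypothetical work items, not the original build code):
import multiprocessing

def _collect(items, lock, array):
    entries = [i * 10 for i in items]  # stand-in for a node's index entries
    with lock:                         # append the whole batch without interleaving
        for entry in entries:
            array.append(entry)

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    array = manager.list()
    lock = multiprocessing.Lock()
    jobs = []
    for chunk in ([1, 2], [3, 4]):
        p = multiprocessing.Process(target=_collect, args=(chunk, lock, array))
        p.start()
        jobs.append(p)
    for job in jobs:
        job.join()
    print(list(array))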