def run_modules(self):
    """Run the processing pipeline selected by the parsed arguments.

    When the ``bibscan`` command is given, only the bibliography database
    scan is run. Otherwise the input document is converted to TEI
    (via docx where needed), classified, transformed to NLM and then
    post-processed (reference linking, caption classification, metadata
    merge, bibliography database, final cleanup and the optional
    identifier / XSLT chain / compliance steps).

    ``--puretei`` stops the pipeline right after the TEI transform and
    ``--purenlm`` stops it right after the NLM transform.

    Returns:
        None.
    """
    ag = int(self.gv.settings.args['--aggression'])
    self.debug.print_debug(
        self,
        u'Running at aggression level {0} {1}'.format(
            ag, "[grrr!]" if ag == 10 else ""))

    if ag > 10:
        self.debug.print_debug(
            self,
            "WARNING: safety bail-out features are disabled at aggression level 11")

    # bibscan is a stand-alone mode: scan and stop
    if self.args['bibscan']:
        BibliographyDatabase(self.gv).scan()
        return

    # check for stylesheets
    self.gv.check_file_exists(self.gv.docx_style_sheet_dir)

    # metadata file
    # NOTE(review): the previous code assigned to a bare ``gv`` name that is
    # not bound anywhere in this method; ``self.gv`` (used everywhere else
    # here) is assumed to be the intended target -- confirm.
    self.gv.metadata_file = self.set_metadata_file()

    self.gv.mk_dir(self.gv.output_folder_path)

    if self.args['doc']:
        # run doc to docx conversion
        # then run docx to tei
        UnoconvToDocx(self.gv).run('doc')
        DocxToTei(self.gv).run(True, self.args['--proprietary'])
    elif self.args['odt']:
        # run odt to docx conversion
        # then run docx to tei
        UnoconvToDocx(self.gv).run('odt')
        DocxToTei(self.gv).run(True, self.args['--proprietary'])
    elif self.args['other']:
        # run other unoconv-supported format to docx conversion
        # then run docx to tei
        UnoconvToDocx(self.gv).run('unoconv')
        DocxToTei(self.gv).run(True, self.args['--proprietary'])
    elif self.args['docx']:
        # run docx to tei conversion
        # includes hooks for proprietary transforms if enabled
        DocxToTei(self.gv).run(True, self.args['--proprietary'])
    elif self.args['docxextracted']:
        self.debug.print_debug(self, u'Skipping docx extraction')
        DocxToTei(self.gv).run(False, self.args['--proprietary'])
    elif self.args['tei']:
        self.debug.print_debug(
            self, u'Skipping docx extraction; processing TEI file')
        DocxToTei(self.gv).run(False, self.args['--proprietary'], tei=True)

    if self.args['--puretei']:
        self.debug.print_debug(self, u'Exiting as TEI transform complete')
        return

    metadata = Metadata(self.gv)
    metadata.pre_clean()

    # run size classifier
    # aggression 5
    SizeClassifier(self.gv).run()

    # run bibliographic addins handler
    # aggression 4
    found_bibliography = BibliographyAddins(self.gv).run()

    # run list classifier
    # aggression 4
    ListClassifier(self.gv).run()

    # built unconditionally because the interactive prompt below needs it
    # even when the classifier run itself is skipped
    bibliography_classifier = BibliographyClassifier(self.gv)

    if not found_bibliography:
        # run bibliographic classifier
        # aggression 4
        bibliography_classifier.run()

    # tei
    # aggression 3
    TeiManipulate(self.gv).run()

    # run tei to nlm conversion
    TeiToNlm(self.gv).run(not found_bibliography)

    if self.gv.settings.args['--purenlm']:
        self.debug.print_debug(self, u'Exiting as NLM transform complete')
        return

    manipulate = NlmManipulate(self.gv)

    if not self.gv.used_list_method:
        manipulate.fuse_references()

    # run reference linker
    if not self.args['--nolink']:
        rl = ReferenceLinker(self.gv)
        rl.run(self.args['--interactive'])
        rl.cleanup()

    # run table and graphic caption classifiers, each gated by its own
    # configured aggression threshold
    cc = CaptionClassifier(self.gv)
    aggression = int(self.args['--aggression'])

    if aggression > int(
            self.gv.settings.get_setting(
                'tablecaptions', self, domain='aggression')):
        cc.run_tables()

    if aggression > int(
            self.gv.settings.get_setting(
                'graphiccaptions', self, domain='aggression')):
        cc.run_graphics()

    # run metadata merge
    metadata.run()

    if self.args['--interactive']:
        bibliography_classifier.run_prompt(True)

    # process any bibliography entries that are possible
    BibliographyDatabase(self.gv).run()

    # remove stranded titles and cleanup
    manipulate.final_clean()

    if self.args['--identifiers']:
        IdGenerator(self.gv).run()

    if self.args['--chain']:
        # construct and run an XSLT chainer
        XslChain(self.gv).run()

    if self.args['--clean']:
        ComplianceEnforcer(self.gv).run()
def pre_cleanup(self):
    """Normalise the TEI tree before further processing.

    Two passes are made over the DOM loaded through ``TeiManipulate``:

    1. For every ``tei:div`` that contains a ``tei:head``, each ancestor
       that is not a TEI ``div`` or ``body`` is retagged ``REMOVE`` (walking
       upwards until an allowed ancestor or the root is reached). If any
       ancestor was retagged, the ``REMOVE`` tags are stripped and the tree
       is saved.
    2. For every ``tei:p`` holding ``tei:hi`` children whose ``rend``
       contains "Indent", "Default Style" or "Text Body", each such ``hi``
       is moved into its own new ``p`` inserted after the original
       paragraph (copying the paragraph's ``rend`` when present); the tree
       is saved after each paragraph.
    """
    tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    hi_selector = (
        'tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
        'contains(@rend, "Text Body")]')

    manipulate = TeiManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # make sure that head elements are not encapsulated within any elements
    # that will stop them from being correctly transformed by the XSL
    allowed = (
        '{http://www.tei-c.org/ns/1.0}div',
        '{http://www.tei-c.org/ns/1.0}body',
    )

    count = 0
    for heading_div in tree.xpath('//tei:div[tei:head]', namespaces=tei_ns):
        ancestor = heading_div.getparent()
        while ancestor is not None:
            # stop climbing at the first allowed (or untagged) ancestor
            if not ancestor.tag or ancestor.tag in allowed:
                break
            ancestor.tag = 'REMOVE'
            count += 1
            ancestor = ancestor.getparent()

    if count:
        etree.strip_tags(tree, 'REMOVE')
        manipulate.save_tree(tree)
        self.debug.print_debug(
            self,
            u'Extracted {0} headings from inside invalid elements'.format(count))

    # split any p tags with sub-tags hi rend="Indent" into new elements
    paragraphs = tree.xpath('//tei:p[' + hi_selector + ']', namespaces=tei_ns)

    for paragraph in paragraphs:
        insert_after = paragraph
        for hi_element in paragraph.xpath(hi_selector, namespaces=tei_ns):
            split_p = etree.Element('p')
            if 'rend' in paragraph.attrib:
                split_p.attrib['rend'] = paragraph.attrib['rend']
            # chain the new paragraphs so they keep document order
            insert_after.addnext(split_p)
            split_p.append(hi_element)
            insert_after = split_p

        manipulate.save_tree(tree)
        self.debug.print_debug(
            self,
            u'Separated out p {0}'.format(
                manipulate.get_stripped_text(paragraph)))
def pre_clean(self):
    """Remove leading body lines that repeat already-extracted metadata.

    After ``extract_metadata_fields()`` has run, the elements under
    ``tei:body`` are walked in document order. Only TEI ``head``, ``p`` and
    ``cit`` elements are considered. A candidate is removed from the tree
    when its stripped text contains every component of a not-yet-matched
    entry in ``self.authors``, or, failing that, any value in
    ``self.metadata``. Removed candidates do not count towards the limit;
    the walk stops once three candidates have been kept. The tree is saved
    at the end.
    """
    self.extract_metadata_fields()

    manipulate = TeiManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # get all elements in the body
    section = tree.xpath('//tei:body//*',
                         namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})

    items_to_match = [
        '{http://www.tei-c.org/ns/1.0}head',
        '{http://www.tei-c.org/ns/1.0}p',
        '{http://www.tei-c.org/ns/1.0}cit',
    ]

    count = 0
    matched_authors = []

    for item in section:
        if count > 2:
            break

        if item.tag not in items_to_match:
            continue

        count += 1
        text = self.get_stripped_text(item)

        processed = False
        for author in self.authors:
            # each author may only account for one removed line
            if author in matched_authors:
                continue

            if all(component in text for component in author):
                # found a metadata line
                matched_authors.append(author)
                count -= 1
                item.getparent().remove(item)
                self.debug.print_debug(
                    self,
                    u'Removed line "{0}" '
                    u'because it appears to be author metadata'.format(text))
                processed = True
                break

        if processed:
            continue

        for metadata in self.metadata:
            if metadata in text:
                # found a metadata line
                count -= 1
                item.getparent().remove(item)
                self.debug.print_debug(
                    self,
                    u'Removed line "{0}" '
                    u'because it appears to be duplicated metadata'.format(text))
                # The item is now detached: a second match would call
                # .remove() on a None parent and decrement count twice.
                break

    manipulate.save_tree(tree)