class CassiusImport(Debuggable):
    """Runs a saxon XSL transform converting a JATS XML file into CaSSius format."""

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()
        # absolute first priority is to initialize debugger so that anything triggered here can be logged
        self.debug = Debug()
        Debuggable.__init__(self, 'cassius-import')
        self.in_file = self.args['<in-file>']
        self.out_file = self.args['<out-file>']
        # directory containing this script; the saxon jar and the XSL are located relative to it
        self.dir = os.path.dirname(os.path.abspath(__file__))
        if self.args['--debug']:
            self.debug.enable_debug()
            self.debug.enable_prompt(Interactive(self.args['--debug']))

    @staticmethod
    def read_command_line():
        # docopt builds the argument dictionary from the module docstring
        return docopt(__doc__, version='cassius-import v0.1')

    def run(self):
        """Invokes saxon (with the XML catalog resolver on the classpath) on cassius-main.xsl.

        Reads self.in_file and writes the transformed result to self.out_file.
        """
        # NOTE(review): the command runs through the shell (shell=True) with file
        # names interpolated into the string — paths containing single quotes will
        # break it. Consider an argument list with shell=False. Confirm before changing,
        # as the classpath separator ':' here is POSIX-specific.
        command = "java -cp '{0}{1}saxon9.jar':'{0}{1}..{1}runtime{1}xml-resolver-1.1.jar':'{0}{1}..{1}runtime{1}' net.sf.saxon.Transform -r:org.apache.xml.resolver.tools.CatalogResolver -y:org.apache.xml.resolver.tools.ResolvingXMLReader -x:org.apache.xml.resolver.tools.ResolvingXMLReader -u -o '{2}' '{3}' '{0}{1}..{1}transform{1}xsl{1}cassius-main.xsl'".format(
            self.dir, os.sep, self.out_file, self.in_file)
        #command = "java -jar '{0}{1}saxon9.jar';'{0}{1}..{1}runtime{1}xml-resolver-1.1.jar' -o '{2}' '{3}' '{0}{1}..{1}transform{1}xsl{1}cassius-main.xsl'".format(self.dir, os.sep, self.out_file, self.in_file)
        #-r org.apache.xml.resolver.tools.CatalogResolver -catalog '{0}{1}..{1}runtime{1}catalog.xml'
        self.debug.print_debug(self, u'Running saxon transform (JATS -> CaSSius)')
        subprocess.call(command, stdin=None, shell=True)
class CassiusImport(Debuggable):
    """Runs a saxon XSL transform converting a JATS XML file into CaSSius format.

    NOTE(review): this is a second definition of CassiusImport in the same
    source; if both live in one module, this one shadows the earlier one.
    Confirm whether they belong to different files.
    """

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()
        # absolute first priority is to initialize debugger so that anything triggered here can be logged
        self.debug = Debug()
        Debuggable.__init__(self, 'cassius-import')
        self.in_file = self.args['<in-file>']
        self.out_file = self.args['<out-file>']
        # directory containing this script; tools are resolved relative to it
        self.dir = os.path.dirname(os.path.abspath(__file__))
        if self.args['--debug']:
            self.debug.enable_debug()
            self.debug.enable_prompt(Interactive(self.args['--debug']))

    @staticmethod
    def read_command_line():
        # docopt builds the argument dictionary from the module docstring
        return docopt(__doc__, version='cassius-import v0.1')

    def run(self):
        """Invokes saxon (with the XML catalog resolver on the classpath) on cassius-main.xsl."""
        # NOTE(review): shell=True with interpolated file names — paths containing
        # single quotes break the command; the ':' classpath separator is POSIX-only.
        command = "java -cp '{0}{1}saxon9.jar':'{0}{1}..{1}runtime{1}xml-resolver-1.1.jar':'{0}{1}..{1}runtime{1}' net.sf.saxon.Transform -r:org.apache.xml.resolver.tools.CatalogResolver -y:org.apache.xml.resolver.tools.ResolvingXMLReader -x:org.apache.xml.resolver.tools.ResolvingXMLReader -u -o '{2}' '{3}' '{0}{1}..{1}transform{1}xsl{1}cassius-main.xsl'".format(self.dir, os.sep, self.out_file, self.in_file)
        #command = "java -jar '{0}{1}saxon9.jar';'{0}{1}..{1}runtime{1}xml-resolver-1.1.jar' -o '{2}' '{3}' '{0}{1}..{1}transform{1}xsl{1}cassius-main.xsl'".format(self.dir, os.sep, self.out_file, self.in_file)
        #-r org.apache.xml.resolver.tools.CatalogResolver -catalog '{0}{1}..{1}runtime{1}catalog.xml'
        self.debug.print_debug(self, u'Running saxon transform (JATS -> CaSSius)')
        subprocess.call(command, stdin=None, shell=True)
class Disseminate(Debuggable):
    """Converts an XML input file into FO and/or PDF output.

    FO output is produced by a saxon XSL transform; PDF output by an FO
    processor (Apache FOP or Antenna House), for every formatter/medium
    combination given on the command line.
    """

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.settings = Settings(self.args)
        self.gv = GV(self.settings)
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.out_type = self.args.get('--out-type').lower()
        # Absolute directory of this script; bundled tools are resolved relative to it.
        self.script_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))

    @staticmethod
    def read_command_line():
        """
        Reads and generates a docopt dictionary from the command line parameters.

        Returns
        -------
        docopt : dictionary
            A dictionary, where keys are names of command-line elements and
            values are the parsed values of those elements.
        """
        return docopt(__doc__, version='Disseminate 0.1')

    def get_saxon_path(self):
        """Checks if saxon is available in the default path.

        Returns
        -------
        saxon : str or bool
            Path to the saxon jar if found (bundled copy first, then the
            --saxon override); False otherwise.
        """
        s = os.path.join(self.script_path, self.gv.apps.get('saxon'))
        if os.path.isfile(s):
            return s
        override = self.args.get('--saxon')
        if override and os.path.isfile(override):
            return override
        return False

    def get_module_name(self):
        """
        Reads the name of the module for debugging and logging.

        Returns
        -------
        name : string
            Name of the module.
        """
        return 'disseminate'

    def process(self, args):
        """Runs the typesetter with the given arguments.

        Creates the execution path for the conversion process. Output,
        error and exit code are captured and returned.

        Parameters
        ----------
        args : list
            Application arguments in the correct order.

        Returns
        -------
        output : str
            System standard output.
        err : str
            System standard error.
        exit_code : int
            System exit code.

        See Also
        --------
        subprocess.Popen()
        """
        m = ' '.join(args).strip().split(' ')
        print(' '.join(args))
        # Bug fix: stderr was not piped, so `err` was always None and the
        # failure branch printed nothing useful.
        process = Popen(m, stdout=PIPE, stderr=PIPE)
        output, err = process.communicate()
        exit_code = process.wait()
        # Bug fix: was `exit_code == 1`, which let every other non-zero
        # failure code pass silently.
        if exit_code != 0:
            print(err)
            sys.exit(1)
        return output, err, exit_code

    def run(self):
        """
        Runs converters.

        See Also
        --------
        create_output
        """
        self.create_output(self.out_type)

    def create_output(self, out_type):
        """Create FO or PDF output for every formatter/medium combination.

        Parameters
        ----------
        out_type : str
            Output type ('fo' or 'pdf').

        See Also
        --------
        run_saxon(), get_saxon_path()
        """
        formatters = self.args.get('--formatter').split(',')
        mediums = self.args.get('--medium').split(',')
        for f in formatters:
            f = f.lower()
            for m in mediums:
                m = m.lower()
                # NOTE(review): path is split on os.pathsep (':' on POSIX),
                # not os.sep — confirm this matches create_dirs_recursive's
                # expectations.
                self.gv.create_dirs_recursive(self.args.get('<path>').split(os.pathsep))
                if self.out_type == 'fo':
                    self.debug.print_console(self, self.gv.RUNNING_FO_CONVERSION)
                    saxon_path = self.get_saxon_path()
                    args = self.run_saxon(saxon_path, f, m)
                if self.out_type == 'pdf':
                    self.debug.print_console(self, self.gv.RUNNING_PDF_CONVERSION)
                    args = self.run_fop_processor(f, m)
                output, err, exit_code = self.process(args)

    def run_fop_processor(self, formatter, medium):
        """Builds the argument list for the selected FO processor (fop or ah)."""
        args = []
        if formatter.lower() == 'fop':
            pth = os.path.join(self.script_path, self.gv.apps.get('fop'))
            if self.gv.check_program(pth):
                args = self.run_apache_fop(pth, formatter, medium)
        elif formatter.lower() == 'ah':
            pth = self.gv.apps.get('ah')
            if self.gv.check_program(pth):
                args = self.run_ah_fop(pth, formatter, medium)
        return args

    def run_ah_fop(self, pth, formatter, medium):
        """Builds the Antenna House argument list (FO in, PDF out)."""
        args = [pth]
        args.append('-d')
        args.append('{}/{}.{}.{}.fo'.format(os.path.dirname(self.f), self.gv.uuid, formatter, medium))
        args.append('-o')
        args.append('{}/{}.{}.{}.pdf'.format(self.dr, self.gv.uuid, formatter, medium))
        return args

    def run_apache_fop(self, pth, formatter, medium):
        """Builds the Apache FOP argument list (FO in, PDF out, per-medium config)."""
        style_path = '{}/configurations/fop/conf/{}.{}.xml'.format(self.script_path, formatter, medium)
        args = [pth]
        args.append('-fo')
        args.append('{}/{}.{}.{}.fo'.format(os.path.dirname(self.f), self.gv.uuid, formatter, medium))
        args.append('-pdf')
        args.append('{}/{}.{}.{}.pdf'.format(self.dr, self.gv.uuid, formatter, medium))
        args.append('-c')
        args.append(style_path)
        return args

    def run_saxon(self, saxon_path, formatter, medium):
        """
        Creates the executable path for saxon.

        Parameters
        ----------
        saxon_path : str
            Absolute path of the saxon binary jar file.
        formatter : str
            Name of the FO formatter.
        medium : str
            Name of the medium.

        Returns
        -------
        args : list
            List of arguments for saxon execution path.
        """
        args = ["java", "-jar", saxon_path]
        if self.args.get('--xsl'):
            xsl = self.script_path.split(os.sep)
            xsl.append('stylesheets')
            xsl.append(self.args.get('--xsl'))
            args.append("-xsl:" + os.sep.join(xsl))
        s = self.args.get('<input_file>')
        if os.path.exists(s):
            args.append("-s:" + s)
        else:
            self.debug.print_debug(self, self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + ' ' + s)
            sys.exit(1)
        file_name = '.'.join([self.gv.uuid, formatter.lower(), medium.lower(), 'fo'])
        args.append("-o:" + os.path.join(self.args.get('<path>'), file_name))
        args.append('formatter=' + formatter.lower())
        args.append('medium=' + medium.lower())
        return args
class Merge(Debuggable):
    """
    Standalone Processing object which merges current JATS/BITS XML file
    in to the Body of a BITS-XML document.
    """

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.settings = Settings(self.args)
        self.gv = GV(self.settings)
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.scheme = self.args.get('<scheme>')
        self.set_numbering_tags = self.args.get('--set-numbering-tags')
        # Parse the input file once; all merge operations read from this tree.
        self.tr = etree.parse(os.path.join(self.dr, self.f))
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()

    @staticmethod
    def read_command_line():
        """
        Reads and generates a docopt dictionary from the command line parameters.

        Returns
        -------
        docopt : dictionary
            A dictionary, where keys are names of command-line elements and
            values are the parsed values of those elements.
        """
        return docopt(__doc__, version='xmlMerge 0.0.1')

    def create_output_bits(self):
        """
        Create bits output file; generates a new file if no file is found.
        Otherwise the current file is appended to the book body as a book-part.

        See Also
        --------
        create_book_part_bits, create_book_bits, do_file_io
        """
        # Accumulator file named after the shared uuid; created on first run.
        fuf = os.path.join(self.dr, self.gv.uuid)
        pt = os.path.join(self.dr, os.path.basename(self.gv.uuid))
        trf = None
        if os.path.isfile(fuf):
            trf = etree.parse(fuf)
            bp = trf.find(".//book-body")
            book_part = self.create_book_part_bits()
            bp.append(book_part)
        else:
            trf = self.create_book_bits()
        trf = self.process(trf)
        self.do_file_io(
            etree.tostring(trf, pretty_print=False, xml_declaration=True,
                           encoding='UTF-8', standalone='yes'),
            'w', pt)

    def create_output_jats(self):
        """
        Create jats output file; generates a new file if none exists,
        otherwise merges body sections, references and footnotes into it.

        See Also
        --------
        create_book_part_bits, create_book_bits, do_file_io
        """
        fuf = os.path.join(self.dr, self.gv.uuid)
        pt = os.path.join(self.dr, os.path.basename(self.gv.uuid))
        trf = None
        if os.path.isfile(fuf):
            trf = etree.parse(fuf)
            bpf = trf.find(".//body")
            f, bd, bk = self.get_xml_parts()
            # Move body sections of the current file into the accumulated body.
            if bd is not None:
                for sec in list(bd):
                    bpf.append(sec)
            # Merge back-matter: references and footnotes.
            # NOTE(review): bk is used without a None check here, unlike bd —
            # confirm back-matter is always present in the inputs.
            bkrf = trf.find(".//back/ref-list")
            for r in bk.findall('.//ref-list/ref'):
                bkrf.append(r)
            bkff = trf.find(".//back/fn-group")
            for fn in bk.findall('.//fn-group/fn'):
                bkff.append(fn)
        else:
            trf = self.create_journal_jats()
        trf = self.process(trf)
        self.do_file_io(
            etree.tostring(trf, pretty_print=False, xml_declaration=True,
                           encoding='UTF-8', standalone='yes'),
            'w', pt)

    def process(self, tr):
        """
        Process BITS-XML file and do all transformations into the elementtree.

        Parameters
        ----------
        tr : elementtree
            Element tree as input.

        Returns
        -------
        tr : elementtree
            Transformed element tree.

        See Also
        --------
        globals.set_numbering_tags(), set_book_part_attributes()
        """
        tr = self.gv.set_numbering_tags(self.set_numbering_tags.split(','), tr) if self.set_numbering_tags else tr
        self.set_book_part_attributes(tr)
        return tr

    def set_book_part_attributes(self, tr):
        """
        Add specific attributes to book-part.

        Parameters
        ----------
        tr : elementtree
            Element tree as input.

        Returns
        -------
        tr : elementtree
            Transformed element tree.
        """
        book_parts = tr.findall('.//book-part')
        # Sequential chapter ids: ch_0, ch_1, ...
        for i, b in enumerate(book_parts):
            b.attrib['id'] = "ch_" + str(i)
            b.attrib['book-part-type'] = "chapter"
        return tr

    def create_metadata_path(self, metadata):
        """
        Creates the correct folder path for the metadata file.
        Metadata files should be in a folder: metadata.

        Parameters
        ----------
        metadata : str
            Suffix of the metadata files.

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure.

        Notes
        -----
        We assume that metadata files are stored in a sub-folder named metadata.
        """
        p = os.path.dirname(self.f).split(os.sep)
        # Drop the last four path components to reach the project root.
        del p[-4:]
        name, ext = os.path.splitext(os.path.basename(self.gv.uuid))
        file_name = [name, '.', metadata, '.', 'xml']
        p.append('metadata')
        p.append(''.join(file_name))
        pth = os.sep.join(p)
        self.debug.print_debug(self, 'merging headers' + str(pth))
        return pth

    def get_module_name(self):
        """
        Reads the name of the module for debugging and logging.

        Returns
        -------
        name : string
            Name of the module.
        """
        name = 'merge'
        return name

    def create_book_bits(self):
        """
        Creates a full BITS XML book and optionally adds metadata.

        Returns
        -------
        book : elementtree
            Elementtree which complies to BITS XML scheme.

        See Also
        --------
        create_metadata_path, create_book_part_bits
        """
        nsmap = {
            'xlink': "http://www.w3.org/1999/xlink",
            'mml': "http://www.w3.org/1998/Math/MathML",
            "xml": "http://www.w3.org/XML/1998/namespace"
        }
        book = etree.Element(etree.QName('book'), nsmap=nsmap)
        book.attrib['dtd-version'] = "2.1"
        book.attrib[etree.QName(
            '{http://www.w3.org/XML/1998/namespace}lang')] = "de"
        book.attrib['book-type'] = "proceedings"
        metadata = self.args.get('--metadata')
        if metadata:
            pth = self.create_metadata_path(metadata)
            self.debug.print_console(self, 'merging headers' + str(pth))
            if os.path.isfile(pth):
                bp = etree.parse(pth).find('.//book-meta')
                book.insert(0, bp)
            else:
                # Missing metadata file is reported but not fatal.
                self.debug.print_console(
                    self, self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + str(pth))
                #sys.exit(1)
        else:
            sys.exit('Metadata argument undefined')
        bd = etree.Element("book-body")
        bpbd = self.create_book_part_bits()
        bd.append(bpbd)
        book.append(bd)
        return book

    def create_journal_jats(self):
        """
        Creates a full JATS XML article and optionally adds metadata.

        Returns
        -------
        journal : elementtree
            Elementtree which complies to JATS XML scheme.

        See Also
        --------
        create_metadata_path, create_book_part_bits
        """
        nsmap = {
            'xlink': "http://www.w3.org/1999/xlink",
            'mml': "http://www.w3.org/1998/Math/MathML",
            "xml": "http://www.w3.org/XML/1998/namespace"
        }
        journal = etree.Element(etree.QName('article'), nsmap=nsmap)
        journal.attrib['dtd-version'] = "3.0"
        journal.attrib[etree.QName(
            '{http://www.w3.org/XML/1998/namespace}lang')] = "de"
        f, bd, bk = self.get_xml_parts()
        metadata = self.args.get('--metadata')
        if metadata:
            pth = self.create_metadata_path(metadata)
            if os.path.isfile(pth):
                bpm = etree.parse(pth).find('.')
                if bpm is not None:
                    # Only accept a metadata file whose root is <front>.
                    if bpm.getroottree().getroot().tag == 'front':
                        journal.insert(0, bpm)
                else:
                    self.debug.print_debug(self, 'front metadata unspecified')
                    sys.exit(1)
        else:
            # No external metadata: reuse the input file's own front matter.
            journal.insert(0, f)
        journal.append(bd)
        if len(bk) > 0:
            journal.append(bk)
        else:
            # Ensure an empty but well-formed back matter exists.
            back = etree.Element(etree.QName('back'))
            back.append(etree.Element(etree.QName('fn-group')))
            back.append(etree.Element(etree.QName('ref-list')))
            journal.append(back)
        return journal

    def create_book_part_bits(self):
        """
        Reads a JATS XML file and creates a book-part element tree
        according to BITS-XML.

        Returns
        -------
        bp : elementtree
            Book part elementTree.
        """
        f, bd, bk = self.get_xml_parts()
        bp = etree.Element("book-part")
        if f is not None:
            if len(f):
                bp.append(f)
        if bd is not None:
            bp.append(bd)
        if bk is not None:
            bp.append(bk)
        return bp

    def get_xml_parts(self):
        """
        Returns the front-matter, body and back-matter of a JATS XML file
        in the above order.

        Returns
        -------
        f : elementtree
            Front-matter of JATS elementTree.
        bd : elementtree
            Body of JATS elementTree.
        bk : elementtree
            Back-matter of JATS elementTree.
        """
        r = self.tr.getroot()
        f = r.find(".//front")
        if f is None:
            # BITS inputs carry book-part-meta instead of front.
            f = r.find(".//book-part-meta")
        bd = r.find(".//body")
        bk = r.find(".//back")
        return f, bd, bk

    def do_file_io(self, s, mode, pth):
        """
        Executes read or write operations on a path.

        Parameters
        ----------
        s : str
            Content to be written or None for read.
        mode : str
            'w' for write, 'r' for read.
        pth : str
            Path to the file to be read or written.

        Raises
        ------
        IOError
            I/O operation fails.
        """
        try:
            w = open(pth, mode)
            if mode == 'w':
                w.write(s.rstrip('\r\n'))
                w.close()
            if mode == 'r':
                o = w.read()
                w.close()
        except IOError as i:
            self.debug.print_debug(self, i)
            print(i)
            sys.exit(1)

    def run(self):
        """
        Runs the configuration on the processing object.
        Process JATS-XML file and merges it into the full BITS-XML file.

        See Also
        --------
        create_output_bits

        Warning
        -------
        function create_output_jats not yet used
        """
        self.gv.create_dirs_recursive(self.dr.split('/'))
        if self.scheme == 'bits':
            self.create_output_bits()
        elif self.scheme == 'jats':
            # NOTE(review): create_output_jats() returns None (it writes the
            # file as a side effect), so self.tr is cleared here — confirm
            # intent.
            self.tr = self.create_output_jats()
class GV(object):
    """Global variables: application paths, shared message constants and helpers."""

    def __init__(self, settings):
        # GLOBAL VARIABLES
        self.settings = settings
        # application paths
        self.apps = {'fop': 'fop/fop/fop',
                     'saxon': 'tools/meTypeset/runtime/saxon9.jar',
                     'ah': '/usr/AHFormatterV65_64/run.sh',
                     'xep': '/usr/local/xep/bin/xep/xep'
                     }
        # projects
        self.PROJECT_INPUT_FILE_JSON_IS_NOT_VALID = 'project input file json is not valid'
        self.PROJECT_INPUT_FILE_TYPE_IS_NOT_SPECIFIED = 'project input file type is not specified'
        self.PROJECT_INPUT_FILE_HAS_MORE_THAN_TWO_DOTS = 'project input file has more than two dots'
        self.PROJECT_INPUT_FILE_DOES_NOT_EXIST = 'project input_file does not exist'
        self.PROJECT_IS_NOT_ACTIVE = 'project is not active'
        self.PROJECT_OUTPUT_FILE_IS_NOT_DEFINED = 'project output file is not defined'
        self.PROJECT_OUTPUT_FILE_TYPE_IS_NOT_SPECIFIED = 'project output file type is not defined'
        self.PROJECT_OUTPUT_FILE_WAS_NOT_CREATED = 'project output file was not created'
        self.PROJECT_TYPESETTER_IS_NOT_AVAILABLE = 'project typesetter is not available'
        self.PROJECT_TYPESETTER_IS_NOT_SPECIFIED = 'project typesetter is not specified'
        self.PROJECT_TYPESETTER_NAME_IS_NOT_SPECIFIED = 'project typesetter name is not specified'
        self.PROJECT_TYPESETTER_VAR_IS_NOT_SPECIFIED = 'project typesetter varaible is not specified'
        self.PROJECT_TYPESETTERS_ARE_NOT_SPECIFIED = 'project typesetters are not specified'
        self.PROJECTS_VAR_IS_NOT_SPECIFIED = 'project variable is not specified'
        self.PROJECT_TYPESETTER_PROCESS_METHOD_NOT_SPECIFIED = 'project typesetter process method not specified'
        self.PROJECTS_TYPESETTER_RUNS_WITH_NO_ARGUMENTS = 'projects typesetter runs with no arguments'
        # typesetter errors
        self.TYPESETTER_ARGUMENTS_NOT_DEFINED = "typesetter arguments not defined"
        self.TYPESETTER_EXECUTABLE_VARIABLE_IS_UNDEFINED = 'typesetter executable variable is undefined'
        self.TYPESETTER_FILE_OUTPUT_TYPE_IS_UNDEFINED = 'typesetter file output type is undefined'
        self.TYPESETTER_METADATA_FILE_WAS_NOT_SPECIFIED = 'Metadata file wasn\'t specified '
        self.TYPESETTER_METYPESET_RUNS_WITH_DEFAULT_METADATA_FILE = 'typesetter metypeset runs with default metadata file'
        self.TYPESETTER_IS_NOT_SPECIFIED = 'typesetter is not specified '
        self.TYPESETTER_PATH_IS_NOT_SPECIFIED = 'typesetter path is not specified '
        self.TYPESETTER_BINARY_IS_UNAVAILABLE = 'typesetter binary is unavailable '
        self.TYPESETTER_RUNS_WITH_NO_ARGUMENTS = 'typesetter runs with no arguments'
        # xml
        self.RUNNING_FO_CONVERSION = 'running FO conversion'
        self.RUNNING_PDF_CONVERSION = 'running PDF conversion'
        self.XML_ELEMENT_NOT_FOUND = 'xml element not found'
        self.XML_FILE_NOT_CREATED = 'xml file not created'
        self.XML_INPUT_FILE_IS_NOT_FOUND = 'xml input file is not found'
        self.XML_INPUT_FILE_IS_NOT_VALID = 'xml input file is not valid'
        self.SAXON_IS_NOT_AVAILABLE = 'saxon is not available'
        self.FOP_PATH_IS_NOT_AVAILABLE = 'fop path is not available'
        # WORDS
        self.OUTPUT = 'Output'
        self.debug = Debug()
        self.numeral_map = numeral_map
        # LOG Object
        self.log = []
        self.uuid = 'mpt'
        self.version = '0.0.1'

    @staticmethod
    def fatal_error(module, message):
        """
        Prints a formatted error message and exits.

        Parameters
        ----------
        module : python module
            Provides the name of the module via get_module_name().
        message : str
            Error message.

        See Also
        --------
        module.get_module_name()
        """
        print(('[FATAL ERROR] [{0}] {1}'.format(
            module.get_module_name(), message)))
        sys.exit(1)

    def is_json(self, s):
        """
        Checks whether a string is a valid json string.

        Parameters
        ----------
        s : str
            JSON data as string.

        Returns
        -------
        bool
            True if s parses as JSON, False otherwise.
        """
        # Bug fix: the original returned json.loads(s) (the parsed object,
        # falsy for valid inputs like '[]', '{}' or '0') and its trailing
        # `return True` was unreachable. Now always returns a real boolean.
        try:
            json.loads(s)
        except ValueError:
            return False
        return True

    def read_json(self, pth):
        """
        Reads a json file from a system path or URL, or exits.

        Parameters
        ----------
        pth : str
            Path of the file in the folder structure, or a URL.

        Returns
        -------
        json : json
            json object.
        """
        if os.path.isfile(pth):
            with open(pth) as j:
                return json.load(j)
        else:
            try:
                r = requests.get(pth, verify=False, stream=True)
                if r.status_code == 200:
                    return r.json()
                else:
                    self.debug.print_debug(
                        self, self.PROJECT_INPUT_FILE_JSON_IS_NOT_VALID)
                    sys.exit(1)
            except requests.exceptions.ConnectionError as ce:
                # Bug fix: `ce.message` does not exist on Python 3 exceptions
                # (AttributeError); str(ce) works on both Python 2 and 3.
                self.debug.print_debug(self, str(ce))
                sys.exit(1)

    def create_dirs_recursive(self, pth):
        """
        Recursively creates directories for a system path, or exits on failure.

        Parameters
        ----------
        pth : list
            Path components to be created.

        Returns
        -------
        p : str
            The created absolute path.
        """
        p = ''
        for path in pth:
            # Note: the original applied strip('/') twice; once is equivalent.
            p = p + os.path.sep + path.strip('/')
            if not os.path.exists(p):
                try:
                    os.makedirs(p)
                except OSError as o:
                    print(o)
                    sys.exit(1)
        return p

    def set_numbering_tags(self, tags, tr):
        """
        Automatic numbering of the list of elements.

        Parameters
        ----------
        tags : list
            List of tag names; each matching element gets a sequential id
            of the form <tag-without-dashes><n>.
        tr : elementtree
            Tree to modify in place.

        Returns
        -------
        tr : elementtree
        """
        for tag in tags:
            sh = tr.findall('.//' + tag)
            sid = 1
            for i in sh:
                i.set('id', tag.replace('-', '') + str(sid))
                sid += 1
        return tr

    def check_program(self, p):
        """
        Checks whether the program or typesetter is installed and executable.

        Parameters
        ----------
        p : str
            Program path.

        Returns
        -------
        str or None
            The resolved executable path, or None if not found.
        """
        def is_exe(f_path):
            # True when the path exists and is executable by this process.
            return os.path.isfile(f_path) and os.access(f_path, os.X_OK)

        fpath, fname = os.path.split(p)
        if fpath:
            if is_exe(p):
                return p
        else:
            # Bare program name: search the PATH like a shell would.
            for path in os.environ["PATH"].split(os.pathsep):
                path = path.strip('"')
                exe_file = os.path.join(path, p)
                if is_exe(exe_file):
                    return exe_file
        return None
class Disseminate(Debuggable):
    """Converts an XML input file into FO and/or PDF output (Python 2 variant).

    NOTE(review): this chunk uses Python 2 print statements and a different
    GV API (GV() with no settings, attribute-style paths) than the other
    Disseminate definition — confirm these belong to different files.
    """

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.gv = GV()
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.out_type = self.args.get('--out-type').lower()
        # Absolute directory of this script; bundled tools resolved relative to it.
        self.script_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))

    @staticmethod
    def read_command_line():
        """
        Reads and generates a docopt dictionary from the command line parameters.

        Returns
        -------
        docopt : dictionary
            A dictionary, where keys are names of command-line elements and
            values are the parsed values of those elements.
        """
        return docopt(__doc__, version='Disseminate 0.1')

    def get_saxon_path(self):
        """Checks if saxon is available in the default path.

        Returns
        -------
        saxon : str or bool
            Path of the saxon jar, or False when not found.
        """
        s = os.path.join(self.script_path, self.gv.METYPESET_PATH)
        if os.path.isfile(s):
            return s
        elif self.args.get('--saxon'):
            if os.path.isfile(self.args.get('--saxon')):
                return self.args.get('--saxon')
            else:
                return False
        else:
            return False

    def get_module_name(self):
        """
        Reads the name of the module for debugging and logging.

        Returns
        -------
        name : string
            Name of the module.
        """
        name = 'OUTPUT Generation'
        return name

    def process(self, args):
        """Runs the typesetter with given arguments.

        Parameters
        ----------
        args : list
            Application arguments in the correct order.

        Returns
        -------
        output : str
            System standard output.
        err : str
            System standard error.
        exit_code : str
            System exit code.

        See Also
        --------
        subprocess.Popen()
        """
        m = ' '.join(args).strip().split(' ')
        print ' '.join(args)
        # NOTE(review): stderr is not piped, so `err` is always None here;
        # also only exit code 1 is treated as failure — other non-zero
        # codes pass silently. Confirm before relying on the error output.
        process = Popen(m, stdout=PIPE)
        output, err = process.communicate()
        exit_code = process.wait()
        if exit_code == 1:
            print err
            sys.exit(1)
        return output, err, exit_code

    def run(self):
        """
        Runs converters.

        See Also
        --------
        create_output
        """
        self.create_output(self.out_type)

    def create_output(self, out_type):
        """Create FO or PDF output for every formatter/medium combination.

        Parameters
        ----------
        out_type : str
            Output type.

        See Also
        --------
        run_saxon(), get_saxon_path()
        """
        formatters = self.args.get('--formatter').split(',')
        mediums = self.args.get('--medium').split(',')
        for f in formatters:
            f = f.lower()
            for m in mediums:
                m = m.lower()
                # NOTE(review): path split on os.pathsep (':'), not os.sep.
                self.gv.create_dirs_recursive(self.args.get('<path>').split(os.pathsep))
                if self.out_type == 'fo':
                    self.debug.print_console(self, self.gv.RUNNING_FO_CONVERSION)
                    saxon_path = self.get_saxon_path()
                    args = self.run_saxon(saxon_path, f, m)
                if self.out_type == 'pdf':
                    self.debug.print_console(self, self.gv.RUNNING_PDF_CONVERSION)
                    args = self.run_fop_processor(f, m)
                output, err, exit_code = self.process(args)
                print output

    def run_fop_processor(self, formatter, medium):
        # Selects Apache FOP or Antenna House and builds its argument list.
        args = []
        if formatter.lower() == 'fop':
            pth = os.path.join(self.script_path, self.gv.APACHE_FOP_PATH)
            if self.gv.check_program(pth):
                args = self.run_apache_fop(pth, formatter, medium)
        elif formatter.lower() == 'ah':
            pth = self.gv.ANTENNA_HOUSE_FOP_PATH
            if self.gv.check_program(pth):
                args = self.run_ah_fop(pth, formatter, medium)
        return args

    def run_ah_fop(self, pth, formatter, medium):
        # Antenna House: FO in (-d), PDF out (-o).
        args = [pth]
        args.append('-d')
        args.append('{}/{}.{}.{}.fo'.format(os.path.dirname(self.f), self.gv.uuid, formatter, medium))
        args.append('-o')
        args.append('{}/{}.{}.{}.pdf'.format(self.dr, self.gv.uuid, formatter, medium))
        return args

    def run_apache_fop(self, pth, formatter, medium):
        # Apache FOP: FO in (-fo), PDF out (-pdf), per-medium config (-c).
        style_path = '{}/configurations/fop/conf/{}.{}.xml'.format(self.script_path, formatter, medium)
        args = [pth]
        args.append('-fo')
        args.append('{}/{}.{}.{}.fo'.format(os.path.dirname(self.f), self.gv.uuid, formatter, medium))
        args.append('-pdf')
        args.append('{}/{}.{}.{}.pdf'.format(self.dr, self.gv.uuid, formatter, medium))
        args.append('-c')
        args.append(style_path)
        return args

    def run_saxon(self, saxon_path, formatter, medium):
        """
        Creates the executable path for saxon.

        Parameters
        ----------
        saxon_path : str
            Absolute path of the saxon binary jar file.
        formatter : str
            Name of the FO formatter.
        medium : str
            Name of the medium.

        Returns
        -------
        args : list
            List of arguments for saxon execution path.
        """
        args = ["java", "-jar", saxon_path]
        if self.args.get('--xsl'):
            # Stylesheets live in a sibling 'stylesheets' directory one level up.
            xsl = self.script_path.split(os.sep)[:-1]
            xsl.append('stylesheets')
            xsl.append(self.args.get('--xsl'))
            args.append("-xsl:" + os.sep.join(xsl))
        s = self.args.get('<input_file>')
        if os.path.exists(s):
            args.append("-s:" + s)
        else:
            self.debug.print_debug(self, self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + ' ' + s)
            sys.exit(1)
        file_name = '.'.join([self.gv.uuid, formatter.lower(), medium.lower(), 'fo'])
        args.append("-o:" + os.path.join(self.args.get('<path>'), file_name))
        args.append('formatter=' + formatter.lower())
        args.append('medium=' + medium.lower())
        return args
class ChronicWordFreq(Debuggable):
    """Counts, per year, how many corpus files mention given words and writes a CSV.

    The year is extracted from each file name; output cells hold the
    percentage of that year's files containing the word.
    """

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()
        # absolute first priority is to initialize debugger so that anything triggered here can be logged
        self.debug = Debug()
        Debuggable.__init__(self, 'CWF')
        self.corpus = self.args['<corpus_directory>']
        self.words = self.args['<word_list>'].split(",")
        self.output = self.args['<output_csv>']
        self.terms = {}       # word -> {year -> number of files containing the word}
        self.years = []       # distinct years seen; sorted before output
        self.year_count = {}  # year -> total number of files for that year
        if self.args['--debug']:
            self.debug.enable_debug()
            self.debug.enable_prompt(Interactive(self.args['--debug']))

    @staticmethod
    def read_command_line():
        # docopt builds the argument dictionary from the module docstring
        return docopt(__doc__, version='chronicWordFreq 0.1')

    def read_file(self, file):
        """Tally which of the target words occur in a single corpus file.

        The file's year is the first four-digit run in its name; files
        without a date are logged and skipped. Matching is case-insensitive
        substring containment.

        Parameters
        ----------
        file : str
            File name relative to the corpus directory.
        """
        match = re.search(r'\d{4}', file)
        year = match.group(0) if match else 'NODATE'
        if year == 'NODATE':
            self.debug.print_debug(self, u'No date detected in filename: {0}. Ignoring.'.format(file))
            return
        self.debug.print_debug(self, u'Processing {0} for year {1}.'.format(file, year))
        if year not in self.years:
            self.years.append(year)
        self.year_count[year] = self.year_count.get(year, 0) + 1
        with open(join(self.corpus, file)) as f:
            content = f.read().upper()
        for word in self.words:
            if word.upper() in content:
                # setdefault/get replace the original's nested if/else chains.
                year_counts = self.terms.setdefault(word, {})
                year_counts[year] = year_counts.get(year, 0) + 1
                self.debug.print_debug(self, u'Found {0} in {1}.'.format(word, file))

    def read_dir(self):
        """Return the plain files directly inside the corpus directory."""
        files = [f for f in listdir(self.corpus) if isfile(join(self.corpus, f))]
        return files

    def write_output(self):
        """Write the CSV: one row per word, one column per (sorted) year.

        Each cell is the percentage of that year's files containing the word.

        Bug fix: words that never occurred produced a row with no columns at
        all (just the bare word), yielding a ragged CSV; they now get an
        explicit 0 for every year like partially-seen words do.
        """
        self.years.sort()
        output_list = [u'{0},{1}\n'.format('Word', ",".join(self.years))]
        for word in self.words:
            line = word
            counts = self.terms.get(word, {})
            for year in self.years:
                if year in counts:
                    percent = (float(counts[year]) / float(self.year_count[year])) * 100
                    line += u',{0}'.format(percent)
                else:
                    line += u',0'
            output_list.append(line + '\n')
        with open(self.output, 'w') as f:
            f.writelines(output_list)

    def run(self):
        """Scan every corpus file, then write the aggregated CSV."""
        file_list = self.read_dir()
        for file in file_list:
            self.read_file(file)
        self.write_output()
class Merge(Debuggable):
    """
    Standalone Processing object which merges current JATS/BITS XML file
    in to the Body of a BITS-XML document.
    """

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.gv = GV()
        self.uid = self.gv.uuid
        self.dr = self.args.get("<path>")
        self.f = self.args.get("<input_file>")
        self.scheme = self.args.get("<scheme>")
        self.set_numbering_tags = self.args.get("--set-numbering-tags")
        # Parse the input file once; all merge operations read from this tree.
        self.tr = etree.parse(os.path.join(self.dr, self.f))
        Debuggable.__init__(self, "Main")
        if self.args.get("--debug"):
            self.debug.enable_debug()

    @staticmethod
    def read_command_line():
        """
        Reads and generates a docopt dictionary from the command line parameters.

        Returns
        -------
        docopt : dictionary
            A dictionary, where keys are names of command-line elements and
            values are the parsed values of those elements.
        """
        return docopt(__doc__, version="xmlMerge 0.0.1")

    def create_output_bits(self):
        """
        Create bits output file; generates a new file if no file is found.
        Otherwise the current file is appended to the book body as a book-part.

        See Also
        --------
        create_book_part_bits, create_book_bits, do_file_io
        """
        # Accumulator file named after the shared uuid; created on first run.
        fuf = os.path.join(self.dr, self.uid)
        pt = os.path.join(self.dr, os.path.basename(self.uid))
        trf = None
        if os.path.isfile(fuf):
            trf = etree.parse(fuf)
            bp = trf.find(".//book-body")
            book_part = self.create_book_part_bits()
            bp.append(book_part)
        else:
            trf = self.create_book_bits()
        trf = self.process(trf)
        self.do_file_io(
            etree.tostring(trf, pretty_print=True, xml_declaration=True, encoding="UTF-8", standalone="yes"),
            "w", pt
        )

    def process(self, tr):
        """
        Process BITS-XML file and do all transformations into the elementtree.

        Parameters
        ----------
        tr : elementtree
            Element tree as input.

        Returns
        -------
        tr : elementtree
            Transformed element tree.

        See Also
        --------
        globals.set_numbering_tags(), set_book_part_attributes()
        """
        tr = self.gv.set_numbering_tags(self.set_numbering_tags.split(","), tr) if self.set_numbering_tags else tr
        self.set_book_part_attributes(tr)
        return tr

    def set_book_part_attributes(self, tr):
        """
        Add specific attributes to book-part.

        Parameters
        ----------
        tr : elementtree
            Element tree as input.

        Returns
        -------
        tr : elementtree
            Transformed element tree.
        """
        book_parts = tr.findall(".//book-part")
        # Sequential chapter ids: ch_0, ch_1, ...
        for i, b in enumerate(book_parts):
            b.attrib["id"] = "ch_" + str(i)
            b.attrib["book-part-type"] = "chapter"
        return tr

    def create_metadata_path(self, metadata):
        """
        Creates the correct folder path for the metadata file.
        Metadata files should be in a folder: metadata.

        Parameters
        ----------
        metadata : str
            Suffix of the metadata files.

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure.

        Notes
        -----
        We assume that metadata files are stored in a sub-folder named metadata.
        """
        p = os.path.dirname(self.f).split(os.sep)
        # Drop the last four path components to reach the project root.
        del p[-4:]
        name, ext = os.path.splitext(os.path.basename(self.uid))
        file_name = [name, ".", metadata, ext]
        p.append("metadata")
        p.append("".join(file_name))
        pth = os.sep.join(p)
        return pth

    def create_book_bits(self):
        """
        Creates a full BITS XML book and optionally adds metadata.

        Returns
        -------
        book : elementtree
            Elementtree which complies to BITS XML scheme.

        See Also
        --------
        create_metadata_path, create_book_part_bits
        """
        nsmap = {
            "xlink": "http://www.w3.org/1999/xlink",
            "mml": "http://www.w3.org/1998/Math/MathML",
            "xml": "http://www.w3.org/XML/1998/namespace",
        }
        book = etree.Element(etree.QName("book"), nsmap=nsmap)
        book.attrib["dtd-version"] = "2.1"
        book.attrib[etree.QName("{http://www.w3.org/XML/1998/namespace}lang")] = "de"
        book.attrib["book-type"] = "proceedings"
        metadata = self.args.get("--metadata")
        if metadata:
            pth = self.create_metadata_path(metadata)
            if os.path.isfile(pth):
                bp = etree.parse(pth).find(".//book-meta")
                book.insert(0, bp)
        bd = etree.Element("book-body")
        bpbd = self.create_book_part_bits()
        bd.append(bpbd)
        book.append(bd)
        return book

    def create_book_part_bits(self):
        """
        Reads a JATS XML file and creates a book-part element tree
        according to BITS-XML.

        Returns
        -------
        bp : elementtree
            Book part elementTree.
        """
        f, bd, bk = self.get_xml_parts()
        bp = etree.Element("book-part")
        if f is not None:
            if len(f):
                bp.append(f)
        # Bug fix: bd/bk were appended unconditionally; get_xml_parts returns
        # None when the input has no body or back matter, and appending None
        # raises a TypeError. Guard both, consistent with the sibling Merge
        # implementation.
        if bd is not None:
            bp.append(bd)
        if bk is not None:
            bp.append(bk)
        return bp

    def get_xml_parts(self):
        """
        Returns the front-matter, body and back-matter of a JATS XML file
        in the above order.

        Returns
        -------
        f : elementtree
            Front-matter of JATS elementTree.
        bd : elementtree
            Body of JATS elementTree.
        bk : elementtree
            Back-matter of JATS elementTree.
        """
        r = self.tr.getroot()
        f = r.find(".//front")
        if f is None:
            # BITS inputs carry book-part-meta instead of front.
            f = r.find(".//book-part-meta")
        bd = r.find(".//body")
        bk = r.find(".//back")
        return f, bd, bk

    def do_file_io(self, s, mode, pth):
        """
        Executes read or write operations on a path.

        Parameters
        ----------
        s : str
            Content to be written or None for read.
        mode : str
            "w" for write, "r" for read.
        pth : str
            Path to the file to be read or written.

        Raises
        ------
        IOError
            I/O operation fails.
        """
        try:
            w = open(pth, mode)
            if mode == "w":
                w.write(s)
                w.close()
            if mode == "r":
                o = w.read()
                w.close()
        except IOError as i:
            self.debug.print_debug(self, i)
            print(i)
            sys.exit(1)

    def run(self):
        """
        Runs the configuration on the processing object.
        Process JATS-XML file and merges it into the full BITS-XML file.

        See Also
        --------
        create_output_bits

        Warning
        -------
        function create_output_jats not yet used
        """
        self.gv.create_dirs_recursive(self.dr.split("/"))
        if self.scheme == "bits":
            self.create_output_bits()
        elif self.scheme == "jats":
            # NOTE(review): create_output_jats is not defined on this class;
            # this branch raises AttributeError as written. Left intact so the
            # failure stays visible — confirm whether the jats path should be
            # ported over from the sibling Merge implementation.
            self.tr = self.create_output_jats(self.tr)
class KernelDensity(Debuggable):
    """Command-line driver that draws kernel-density term plots for every
    .txt file in a directory."""

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything
        # triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'plotsummary')

        self.in_dir = self.args['<directory>']
        self.term_file = self.args['<term_file>']
        # use a context manager so the term file handle is not leaked
        with open(self.term_file) as term_handle:
            self.terms = [line.strip().lower() for line in term_handle]
        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()
            self.debug.enable_prompt(Interactive(self.args['--debug']))

        if self.args['--caption']:
            self.caption = self.args['--caption']
        else:
            self.caption = 'Term Plot'

        # --nostem names a file of words that must not be stemmed
        if self.args['--nostem']:
            self.nostem = self.args['--nostem']
        else:
            self.nostem = None

        # exactly one docopt command selects the plotting action
        if self.args['single']:
            self.action = 'single'
        elif self.args['group']:
            self.second_term_file = self.args['<second_term_file>']
            self.term_name = self.args['<term_name>']
            self.second_term_name = self.args['<second_term_name>']
            with open(self.second_term_file) as second_handle:
                self.second_terms = [line.strip().lower()
                                     for line in second_handle]
            self.action = 'group'
        elif self.args['hist']:
            self.action = 'hist'
        elif self.args['rawcount']:
            self.action = 'rawcount'

    @staticmethod
    def read_command_line():
        """Return the docopt dictionary parsed from the command line."""
        return docopt(__doc__, version='kernel-density-estimation v0.1')

    def run(self):
        """In debug mode, announce how each term will be stemmed; then plot
        every .txt file in the input directory."""
        if self.args['--debug']:
            if self.nostem:
                with open(self.nostem) as f:
                    nostem_words = set(f.read().splitlines())
            else:
                nostem_words = []
            for term in self.terms:
                if term not in nostem_words:
                    self.debug.print_debug(
                        self, u'{0} will be stemmed to {1}'.format(
                            term, Text.show_stem(term)))
                else:
                    self.debug.print_debug(
                        self, u'{0} will not be stemmed'.format(term))
            if self.action == 'group':
                for term in self.second_terms:
                    if term not in nostem_words:
                        self.debug.print_debug(
                            self, u'{0} will be stemmed to {1}'.format(
                                term, Text.show_stem(term)))
                    else:
                        self.debug.print_debug(
                            self, u'{0} will not be stemmed'.format(term))
        file_list = listdir(self.in_dir)
        for file_name in file_list:
            if file_name.endswith(".txt"):
                self.plot(file_name)

    def plot(self, file_name):
        """Plot a single text file according to self.action and save the
        figure next to the input as a .png."""
        self.debug.print_debug(self, u'Loading ' + file_name)
        textplot = Text.from_file(join(self.in_dir, file_name), self.debug,
                                  nostem=self.nostem)
        self.debug.print_debug(self, u'Plotting ' + file_name)
        # self.action is always one of these four (set from the docopt
        # command in __init__)
        if self.action == 'single':
            graph = textplot.plot_terms(self.terms, self.caption)
        elif self.action == 'group':
            graph = textplot.plot_terms_two_groups(
                self.terms, self.term_name, self.second_terms,
                self.second_term_name, self.caption)
        elif self.action == 'hist':
            graph = textplot.plot_terms_histogram(self.terms, self.caption,
                                                  5000)
        elif self.action == 'rawcount':
            graph = textplot.plot_terms_raw_count(self.terms, self.caption,
                                                  5000)
        self.debug.print_debug(
            self, u'Saving ' + file_name.replace('.txt', '.png'))
        graph.savefig(join(self.in_dir, file_name.replace('.txt', '.png')))
        graph.close()
class MPT(Debuggable):
    """
    MPT Class Object, which initializes the properties and defines the
    methods.
    """

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.settings = Settings(self.args)
        self.gv = GV(self.settings)
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        # unique, sortable name for this run's output folder
        self.current_result = datetime.datetime.now().strftime(
            "%Y_%m_%d-%H-%M-%S-") + str(uuid.uuid4())[:4]
        self.config = None
        self.all_typesetters = None
        self.script_folder = os.path.dirname(os.path.realpath(__file__))

    @staticmethod
    def read_command_line():
        """
        Read and generate a docopt dictionary from the command line
        parameters.

        Returns
        -------
        docopt : dictionary
            A dictionary, where keys are names of command-line elements
            and values are the parsed values of those elements.
        """
        return docopt(__doc__, version='heiMPT 0.0.1')

    def get_module_name(self):
        """
        Read the name of the module for debugging and logging.

        Returns
        -------
        name : string
            Name of the Module
        """
        name = 'heiMPT'
        return name

    def call_typesetter(self, args):
        """Run a typesetter with the given arguments.

        Builds the execution command for a typesetter or an application and
        runs it as a system process; output, error and exit code are
        captured and returned.

        Parameters
        ----------
        args : list
            application arguments in the correct order.

        Returns
        -------
        output : str
            system standard output.
        err : str
            system standard error.
        exit_code : str
            system exit code.

        See Also
        --------
        subprocess.Popen()
        """
        args_str = ' '.join(args)
        if ': ' in args_str:
            args_str = args_str.replace(': ', ':')
            self.debug.print_debug(
                self,
                "Merging command: file into command:file, can be a problem "
                "for some applications")
        m = args_str.strip().split(' ')
        process = Popen(m, stdout=PIPE)
        output, err = process.communicate()
        exit_code = process.wait()
        return output, err, exit_code

    def arguments_parse(self, t_props):
        """Read typesetter properties from the JSON configuration and
        create the argument vector.

        Parameters
        ----------
        t_props : dictionary
            typesetter properties

        Returns
        -------
        args : list
            application execution path and arguments in the correct order.
        """
        args = []
        if t_props.get('executable'):
            args = [t_props.get('executable')]
        else:
            self.debug.print_debug(
                self, self.gv.TYPESETTER_EXECUTABLE_VARIABLE_IS_UNDEFINED)
            sys.exit(1)
        arguments = t_props.get("arguments")
        if arguments:
            # arguments are keyed by sequence number; keep them ordered
            arguments = collections.OrderedDict(sorted(arguments.items()))
            for a in arguments:
                args.append(arguments[a])
        return args

    def create_output_path(self, p, p_id, args, prefix, uid):
        """Append the output location to the typesetter argument list.

        The '--create-dir' placeholder in the configured arguments is
        replaced by the temporary output directory for this run.

        Parameters
        ----------
        p : dictionary
            json program properties
        p_id : int
            typesetter id
        args : list
            application arguments in the correct order (mutated in place).
        prefix : str
            file name prefix of the current file (kept for interface
            compatibility; unused here).
        uid : str
            unique id of the current typesetter

        Returns
        -------
        True : boolean
            Returns True if the arguments were extended.
        """
        config_args = p.get('typesetters')[p_id].get("arguments")
        if config_args is None:
            self.debug.print_debug(self,
                                   self.gv.TYPESETTER_ARGUMENTS_NOT_DEFINED)
            sys.exit(1)
        ts_args = collections.OrderedDict(sorted(config_args.items()))
        out_path = os.path.join(p.get('path'), uid)
        for i in ts_args:
            arg = ts_args[i]
            if arg == '--create-dir':
                # placeholder: substitute the temporary output directory
                args.append(out_path)
            else:
                args.append(arg)
        self.debug.print_debug(self, '{} {}'.format('Execute',
                                                    ' '.join(args)))
        return True

    def run_typesetter(self, p, pre_path, pre_out_type, p_id, uid, f_id,
                       f_name, args):
        """Create the temporary output path, call the typesetter and write
        the output to the correct path for a certain file.

        Parameters
        ----------
        p : dictionary
            json program properties
        pre_path : str
            project path of the previous iteration
        pre_out_type : str
            output type of the previous iteration
        p_id : int
            typesetter id
        uid : str
            unique id of the current typesetter
        f_id : int
            sequence number of the current file
        f_name : str
            name of the current file
        args : list
            application arguments in the correct order.

        Returns
        -------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter

        See Also
        --------
        call_typesetter, organize_output
        """
        p_path = ''
        pf_type = ''
        prefix = f_name.split('.')[0]
        # NOTE(review): if p_id is not the first typesetter and the project
        # is not chained, f_path stays unbound and the isfile() call below
        # raises -- confirm configurations always set "chain" in that case.
        if p_id == min(i for i in p['typesetters']):
            # the first typesetter reads from the project path itself
            f_path = os.path.join(p.get('path'), f_name)
        elif p.get("chain"):
            # chained typesetters read the previous step's output
            f_path = os.path.join(pre_path, prefix + '.' + pre_out_type)
        if os.path.isfile(f_path) or p['typesetters'].get(p_id).get('expand'):
            self.debug.print_console(
                self, '\t{}:\t {} '.format('Processing', prefix))
            self.gv.log.append(prefix)
            args.append(f_path)
            self.create_output_path(p, p_id, args, prefix, uid)
            output, err, exit_code = self.call_typesetter(args)
            self.debug.print_debug(self, output.decode('utf-8'))
            p_path = self.organize_output(p, p_id, prefix, f_id, uid, args)
            pf_type = p.get('typesetters')[p_id].get("out_type")
        else:
            self.debug.print_debug(
                self, self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + ' ' +
                os.path.join(f_path))
        return p_path, pf_type

    def typeset_file(self, p, pre_path, pre_out_type, p_id, uid, f_id,
                     f_name):
        """Typeset the current file with typesetter p_id.

        Looks up the typesetter's properties, verifies its binary is
        available and delegates to run_typesetter.

        Parameters
        ----------
        p : dictionary
            json program properties
        pre_path : str
            project path of the previous iteration
        pre_out_type : str
            output type of the previous iteration
        p_id : int
            typesetter id
        uid : str
            unique id of the current typesetter
        f_id : int
            sequence number of the current file
        f_name : str
            name of the current file

        Returns
        -------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter

        See Also
        --------
        run_typesetter
        """
        t_props = self.all_typesetters.get(
            p.get('typesetters')[p_id].get("name"))
        p_path, pf_type = '', ''
        if t_props:
            mt = self.arguments_parse(t_props)
            if self.gv.check_program(t_props.get('executable')):
                p_path, pf_type = self.run_typesetter(p, pre_path,
                                                      pre_out_type, p_id,
                                                      uid, f_id, f_name, mt)
            else:
                self.debug.print_debug(
                    self, t_props.get('executable') +
                    self.gv.TYPESETTER_BINARY_IS_UNAVAILABLE)
        else:
            self.debug.print_debug(
                self, self.gv.PROJECT_TYPESETTER_IS_NOT_AVAILABLE)
        return p_path, pf_type

    def typeset_files(self, p, pre_path, pre_out_type, pre_id):
        """Typeset all files of a certain project with typesetter pre_id.

        Parameters
        ----------
        p : dictionary
            json program properties
        pre_path : str
            project path of the previously executed typesetter
        pre_out_type : str
            project file type of the previously executed typesetter
        pre_id : int
            sequence number of the previously executed file

        Returns
        -------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter

        See Also
        --------
        typeset_file
        """
        p_path, pf_type = '', ''
        uid = str(uuid.uuid4())
        # files are keyed by sequence number; process them in order
        project_files = collections.OrderedDict(
            sorted((int(key), value)
                   for key, value in list(p.get('files').items())))
        if p.get('typesetters')[pre_id].get("expand"):
            # expanding typesetters run once on the whole merged file
            f_name = self.gv.uuid
            p_path, pf_type = self.typeset_file(p, pre_path, pre_out_type,
                                                pre_id, uid, 0, f_name)
        else:
            for f_id in project_files:
                f_name = project_files[f_id]
                p_path, pf_type = self.typeset_file(p, pre_path,
                                                    pre_out_type, pre_id,
                                                    uid, f_id, f_name)
        return p_path, pf_type

    def typeset_project(self, p):
        """Typeset a certain project by running its typesetters in order.

        Parameters
        ----------
        p : dictionary
            json program properties

        Returns
        -------
        True : boolean
            Returns True if all the typesetters in the project have run.

        See Also
        --------
        typeset_files
        """
        typesetters_ordered, temp_path, temp_pre_out_type = '', '', ''
        pre_path = ''
        prev_out_type = ''
        if p.get('active'):
            self.debug.print_console(self, 'PROJECT : ' + p.get('name'))
            self.gv.log.append(p.get("name"))
            ts = p.get('typesetters')
            if ts:
                typesetters_ordered = collections.OrderedDict(
                    sorted(ts.items()))
            else:
                self.debug.print_debug(
                    self, self.gv.PROJECT_TYPESETTERS_ARE_NOT_SPECIFIED)
            if self.all_typesetters is None:
                self.debug.print_debug(
                    self, self.gv.PROJECT_TYPESETTER_VAR_IS_NOT_SPECIFIED)
                sys.exit(1)
            # each step consumes the previous step's output path/type
            for p_id in typesetters_ordered:
                self.debug.print_console(
                    self, ' '.join([
                        'Step', p_id, ':', '\t',
                        p.get('typesetters')[p_id].get("name")
                    ]))
                self.gv.log.append('{} {}'.format(
                    p_id, p.get('typesetters')[p_id].get("name")))
                temp_path, temp_pre_out_type = self.typeset_files(
                    p, pre_path, prev_out_type, p_id)
                pre_path = temp_path
                prev_out_type = temp_pre_out_type
        else:
            self.debug.print_debug(
                self, self.gv.PROJECT_IS_NOT_ACTIVE + ' ' + p.get('name'))
        return True

    def typeset_all_projects(self):
        """Typeset all projects defined in the json file.

        Returns
        -------
        True : boolean
            Returns True if all the typesetters in all projects have run.

        See Also
        --------
        typeset_project
        """
        projects = self.config.get('projects')
        if projects:
            for p in projects:
                self.typeset_project(p)
        else:
            self.debug.print_debug(self,
                                   self.gv.PROJECTS_VAR_IS_NOT_SPECIFIED)
        return True

    def organize_output(self, p, p_id, prefix, f_id, uid, args):
        """Copy the temporary results into the final project path.

        Reads the temporary results of the current typesetter step and
        copies them into the correct output folder, which is constructed
        from project name, current time, typesetter sequence number and
        file sequence number. Tool-specific actions (meTypeset, merge,
        expand) are handled here.

        Parameters
        ----------
        p : dict
            json program properties
        p_id : int
            typesetter id
        prefix : str
            file name prefix of the current file
        f_id : int
            sequence number of the current file
        uid : str
            unique id of the current typesetter
        args : bytearray
            tool parameters, executable file is the first element

        Returns
        -------
        project_path : str
            Final path for the current file

        See Also
        --------
        create_merged_file, gv.create_dirs_recursive
        """
        p_name = p.get('typesetters')[p_id].get("name")
        t_path = [p.get('path'), uid]
        if args:
            if any('meTypeset.py' in arg for arg in args):
                # meTypeset writes its output into an 'nlm' sub-folder
                t_path += ['nlm']
            else:
                # NOTE(review): this duplicates the path/uid components
                # ([path, uid, path, uid]); looks wrong but is preserved --
                # confirm against a non-meTypeset run before changing.
                t_path += [p.get('path'), uid]
        out_type = p['typesetters'][p_id].get('out_type')
        if out_type is None:
            self.debug.print_console(
                self, self.gv.PROJECT_OUTPUT_FILE_TYPE_IS_NOT_SPECIFIED)
            sys.exit(1)
        project_path = [
            p.get('path'), p['name'], self.current_result,
            p_id + '_' + p_name, out_type
        ]
        temp_dir = os.path.join(p.get('path'), uid)
        if p['typesetters'][p_id].get('merge'):
            self.create_merged_file(p, p_id, project_path, t_path)
            # remove the scratch dir once the last file has been merged
            if len(list(p.get('files').items())) == f_id:
                shutil.rmtree(temp_dir)
        elif p['typesetters'][p_id].get('expand'):
            # move every produced file into the project output folder
            for filename in os.listdir(temp_dir):
                p_path = self.gv.create_dirs_recursive(project_path)
                f_path = '{}{}{}'.format(p_path, SEP, filename)
                os.rename(os.path.join(temp_dir, filename), f_path)
            shutil.rmtree(temp_dir)
        elif p['typesetters'][p_id].get('process'):
            if p_name.lower() == 'metypeset' and not os.path.exists(
                    SEP.join(t_path)):
                t_path.append('nlm')
            t_path.append(prefix + '.' + out_type)
            p_path = self.gv.create_dirs_recursive(project_path)
            f_path = '{}{}{}.{}'.format(p_path, SEP, prefix, out_type)
            try:
                os.rename(SEP.join(t_path), f_path)
                shutil.rmtree(temp_dir)
            except FileNotFoundError:
                # BUGFIX: previously print('File not found\t{}', path)
                # passed the path as a second argument and never
                # interpolated it into the message
                print('File not found\t{}'.format(SEP.join(t_path)))
                sys.exit(1)
        else:
            self.debug.print_debug(
                self,
                self.gv.PROJECT_TYPESETTER_PROCESS_METHOD_NOT_SPECIFIED)
        # after the final typesetter and final file, zip the whole project
        if len(list(p.get('typesetters').items())) == int(p_id) and int(
                f_id) == len(list(p.get('files').items())):
            zip_path = ''.join([p.get('path'), SEP, p['name']])
            shutil.make_archive('{}/{}'.format(zip_path, p.get("name")),
                                'zip', zip_path)
        return SEP.join(project_path)

    def create_merged_file(self, p, p_id, project_path, t_path):
        """Create a combined file from a set of input files.

        Parameters
        ----------
        p : dict
            json program properties
        p_id : int
            typesetter id
        project_path : str
            system path to be created
        t_path : str
            temporary output directory

        See Also
        --------
        create_named_file()
        """
        t_path.append(self.gv.uuid)
        p_path = self.gv.create_dirs_recursive(project_path)
        f_path = '{}{}{}.xml'.format(p_path, SEP, self.gv.uuid)
        shutil.copy2(SEP.join(t_path), f_path)
        self.create_named_file(p, p_id, p_path, t_path)
        return f_path

    def create_named_file(self, p, p_id, p_path, t_path):
        """Copy the uniquely named merge result to the configured
        'out_file' name, when one is set.

        Parameters
        ----------
        p : dict
            json program properties
        p_id : int
            typesetter id
        p_path : str
            output directory for the current typesetter
        t_path : str
            temporary output directory
        """
        f = p['typesetters'][p_id].get('out_file')
        if f:
            shutil.copy2(SEP.join(t_path), '{}{}{}'.format(p_path, SEP, f))
        return

    def run_modules(self):
        """Run MPT in module mode; currently supports the OMP import
        plugin."""
        if self.args.get('import'):
            sys.path.insert(
                0, os.path.join(self.script_folder, 'plugins', 'import'))
            import ImportInterface
            if self.args.get('omp'):
                m = "omp"
                plugin_package = __import__(m, fromlist=['*'])
                plugin_module = getattr(plugin_package, m)
                # find the class inheriting from the Import abstract class
                for name in dir(plugin_module):
                    candidate = getattr(plugin_module, name)
                    if inspect.isclass(candidate)\
                            and issubclass(candidate, ImportInterface.Import)\
                            and candidate is not ImportInterface.Import:
                        plugin_class = candidate
                        print(("Found import plugin", name, plugin_class))
                plugin = plugin_class()
                self.debug.print_console(self, str(self.args))
                plugin.run(self.args, {'base-path': self.script_folder})
            else:
                # BUGFIX: concatenating the args dict directly to a str
                # raised TypeError; stringify it first
                self.debug.fatal_error(
                    self, "Unsupported arguments: " + str(self.args))
        return

    def check_applications(self):
        """Check that all configured program binaries are available;
        exits with a fatal error on the first missing one."""
        ts = self.config.get('typesetters')
        # check every formatter listed in '--formatter=...' arguments
        for p in [ts[i]['arguments'] for i in ts]:
            for k in [
                    j for j in list(p.values())
                    if j.find('--formatter') == 0
            ]:
                for l in k.split('=')[1].split(','):
                    if not self.gv.check_program(
                            self.gv.apps.get(l.lower())):
                        self.debug.fatal_error(
                            self, '{} {}'.format(
                                self.gv.apps.get(l.lower()),
                                self.gv.apps.get(l.lower()) +
                                self.gv.TYPESETTER_BINARY_IS_UNAVAILABLE))
                        sys.exit(1)
        # check the typesetter executables themselves
        for p in [ts[i]['executable'] for i in ts]:
            if not self.gv.check_program(p):
                # BUGFIX: the message previously referenced the loop
                # variable 'l' left over from the formatter loop above
                self.debug.fatal_error(
                    self, '{} {}'.format(
                        p, p + self.gv.TYPESETTER_BINARY_IS_UNAVAILABLE))
                sys.exit(1)
class KernelDensity(Debuggable):
    """Command-line driver that draws kernel-density term plots (and
    overlap/search reports) for every .txt file in a directory."""

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything
        # triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'plotsummary')

        self.in_dir = self.args['<directory>']

        # terms come from a file, from two positional terms, or from one
        if self.args['<term_file>']:
            self.term_file = self.args['<term_file>']
            # use a context manager so the term file handle is not leaked
            with open(self.term_file) as term_handle:
                self.terms = [line.strip().lower() for line in term_handle]
        elif self.args["<first_term>"] and self.args["<second_term>"]:
            self.terms = []
            self.terms.append(self.args["<first_term>"])
            self.terms.append(self.args["<second_term>"])
        elif self.args["<term>"]:
            self.terms = []
            self.terms.append(self.args["<term>"])

        # NOTE(review): self.max is only bound when <count> was given, but
        # the 'search' action reads it unconditionally -- TODO confirm the
        # docopt usage string requires <count> together with 'search'
        if self.args["<count>"]:
            self.max = int(self.args["<count>"])

        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()
            self.debug.enable_prompt(Interactive(self.args['--debug']))

        if self.args['--caption']:
            self.caption = self.args['--caption']
        else:
            self.caption = 'Term Plot'

        # --nostem names a file of words that must not be stemmed
        if self.args['--nostem']:
            self.nostem = self.args['--nostem']
        else:
            self.nostem = None

        # exactly one docopt command selects the action
        if self.args['single']:
            self.action = 'single'
        elif self.args['group']:
            self.second_term_file = self.args['<second_term_file>']
            self.term_name = self.args['<term_name>']
            self.second_term_name = self.args['<second_term_name>']
            with open(self.second_term_file) as second_handle:
                self.second_terms = [line.strip().lower()
                                     for line in second_handle]
            self.action = 'group'
        elif self.args['hist']:
            self.action = 'hist'
        elif self.args['rawcount']:
            self.action = 'rawcount'
        elif self.args['overlap']:
            self.action = 'overlap'
        elif self.args['search']:
            self.action = 'search'

        if self.args['--words']:
            self.words = int(self.args['--words'])
        else:
            self.words = 5000

    @staticmethod
    def read_command_line():
        """Return the docopt dictionary parsed from the command line."""
        return docopt(__doc__, version='kernel-density-estimation v0.1')

    def run(self):
        """In debug mode, announce how each term will be stemmed; then plot
        every .txt file in the input directory."""
        if self.args['--debug']:
            if self.nostem:
                with open(self.nostem) as f:
                    nostem_words = set(f.read().splitlines())
            else:
                nostem_words = []
            for term in self.terms:
                # only announce stemming when it will actually change the
                # term
                if term not in nostem_words and term != Text.show_stem(term):
                    self.debug.print_debug(
                        self, u'{0} will be stemmed to {1}'.format(
                            term, Text.show_stem(term)))
                else:
                    self.debug.print_debug(
                        self, u'{0} will not be stemmed'.format(term))
            if self.action == 'group':
                for term in self.second_terms:
                    if term not in nostem_words:
                        self.debug.print_debug(
                            self, u'{0} will be stemmed to {1}'.format(
                                term, Text.show_stem(term)))
                    else:
                        self.debug.print_debug(
                            self, u'{0} will not be stemmed'.format(term))
        file_list = listdir(self.in_dir)
        for file_name in file_list:
            if file_name.endswith(".txt"):
                self.plot(file_name)

    def plot(self, file_name):
        """Plot or report on a single text file according to self.action;
        all actions except 'search' save a .png next to the input."""
        self.debug.print_debug(self, u'Loading ' + file_name)
        textplot = Text.from_file(join(self.in_dir, file_name), self.debug,
                                  nostem=self.nostem)
        self.debug.print_debug(self, u'Plotting ' + file_name)
        if self.action == 'single':
            graph = textplot.plot_terms(self.terms, self.caption)
        elif self.action == 'group':
            graph = textplot.plot_terms_two_groups(
                self.terms, self.term_name, self.second_terms,
                self.second_term_name, self.caption)
        elif self.action == 'hist':
            graph = textplot.plot_terms_histogram(self.terms, self.caption,
                                                  self.words)
        elif self.action == 'rawcount':
            graph = textplot.plot_terms_raw_count(self.terms, self.caption,
                                                  self.words)
        elif self.action == 'overlap':
            graph = textplot.plot_kde_overlap(self.terms)
        elif self.action == 'search':
            # report terms correlated with the anchor term instead of
            # producing a figure
            newterms = textplot.anchored_scores(self.terms[0])
            count = 0
            self.debug.print_(
                self,
                u'Top twenty correlated terms (with more than one '
                u'occurrence) for {0}: '.format(self.terms[0]))
            for item in newterms:
                if len(textplot.terms[item]) > 1 and \
                        item != textplot.stem(self.terms[0]):
                    if count > self.max:
                        break
                    self.debug.print_(self, item)
                    count += 1
        if self.action != 'search':
            self.debug.print_debug(
                self, u'Saving ' + file_name.replace('.txt', '.png'))
            graph.savefig(join(self.in_dir,
                               file_name.replace('.txt', '.png')))
            graph.close()
class MPT(Debuggable):
    """
    MPT Class Object, which initializes the properties and defines the
    methods.
    """

    def __init__(self):
        # parse the command line first so debugging can be enabled early
        self.args = self.read_command_line()
        self.debug = Debug()
        self.gv = GV()
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        # unique, sortable name for this run's output folder
        self.current_result = datetime.datetime.now().strftime(
            "%Y_%m_%d-%H-%M-") + str(uuid.uuid4())[:8]
        self.config = self.gv.read_json(self.args['<config_file>'])
        self.all_typesetters = self.config.get('typesetters')

    def run(self):
        """
        Run the MPT module, which typesets all the projects defined in the
        json input file.

        Returns
        -------
        True : boolean
            Returns True if all the projects are typeset.

        See Also
        --------
        typeset_all_projects
        """
        self.typeset_all_projects()
        return True

    @staticmethod
    def read_command_line():
        """
        Read and generate a docopt dictionary from the command line
        parameters.

        Returns
        -------
        docopt : dictionary
            A dictionary, where keys are names of command-line elements
            and values are the parsed values of those elements.
        """
        return docopt(__doc__, version='mpt 0.0.1')

    def get_module_name(self):
        """
        Read the name of the module for debugging and logging.

        Returns
        -------
        name : string
            Name of the Module
        """
        name = 'MPT'
        return name

    def call_typesetter(self, args):
        """Run a typesetter with the given arguments.

        Builds the execution command and runs it as a system process;
        output, error and exit code are captured and returned.

        Parameters
        ----------
        args : list
            application arguments in the correct order.

        Returns
        -------
        output : str
            system standard output.
        err : str
            system standard error.
        exit_code : str
            system exit code.

        See Also
        --------
        subprocess.Popen()
        """
        m = ' '.join(args).strip().split(' ')
        self.debug.print_console(self, ' '.join(m))
        process = Popen(m, stdout=PIPE)
        output, err = process.communicate()
        exit_code = process.wait()
        return output, err, exit_code

    def arguments_parse(self, t_props):
        """Read typesetter properties from the JSON configuration and
        create the argument vector.

        Parameters
        ----------
        t_props : dictionary
            typesetter properties

        Returns
        -------
        args : list
            application execution path and arguments in the correct order.
        """
        args = []
        if t_props.get('executable'):
            args = [t_props.get('executable')]
        else:
            self.debug.print_debug(
                self, self.gv.TYPESETTER_EXECUTABLE_VARIABLE_IS_UNDEFINED)
            sys.exit(1)
        arguments = t_props.get("arguments")
        if arguments:
            # arguments are keyed by sequence number; keep them ordered
            arguments = collections.OrderedDict(sorted(arguments.items()))
            for a in arguments:
                args.append(arguments[a])
        return args

    def create_output_path(
            self,
            p,
            p_id,
            args,
            prefix,
            uid):
        """Append the output location to the typesetter argument list.

        The 'create_output_directory()' and 'create_output_file()'
        placeholders in the configured arguments are replaced by the
        temporary output directory or output file path respectively.

        Parameters
        ----------
        p : dictionary
            json program properties
        p_id : int
            typesetter id
        args : list
            application arguments in the correct order (mutated in place).
        prefix : str
            file name prefix of the current file
        uid : str
            unique id of the current typesetter

        Returns
        -------
        True : boolean
            Returns True if the arguments were extended.

        See Also
        --------
        os.makedirs()
        """
        ts_args = collections.OrderedDict(
            sorted(p.get('typesetters')[p_id].get("arguments").items()))
        out_type = p.get('typesetters')[p_id].get("out_type")
        out_path = os.path.join(p.get('path'), uid)
        for i in ts_args:
            arg = ts_args[i]
            if arg == 'create_output_directory()':
                # placeholder: substitute the temporary output directory
                args.append(out_path)
            elif arg == 'create_output_file()':
                # placeholder: substitute a full output file path, creating
                # the directory on demand
                if not os.path.exists(out_path):
                    os.makedirs(out_path)
                args.append(
                    os.path.join(out_path, prefix + '.' + out_type))
            else:
                args.append(arg)
        return True

    def run_typesetter(
            self,
            p,
            pre_path,
            pre_out_type,
            p_id,
            uid,
            f_id,
            f_name,
            args):
        """Create the temporary output path, call the typesetter and write
        the output to the correct path for a certain file.

        Parameters
        ----------
        p : dictionary
            json program properties
        pre_path : str
            project path of the previous iteration
        pre_out_type : str
            output type of the previous iteration
        p_id : int
            typesetter id
        uid : str
            unique id of the current typesetter
        f_id : int
            sequence number of the current file
        f_name : str
            name of the current file
        args : list
            application arguments in the correct order.

        Returns
        -------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter

        See Also
        --------
        call_typesetter, organize_output
        """
        p_path = ''
        pf_type = ''
        prefix = f_name.split('.')[0]
        # NOTE(review): if p_id is not the first typesetter and the project
        # is not chained, f_path stays unbound and the isfile() call below
        # raises -- confirm configurations always set "chain" in that case.
        if p_id == min(i for i in p['typesetters']):
            # the first typesetter reads from the project path itself
            f_path = os.path.join(p.get('path'), f_name)
        elif p.get("chain"):
            # chained typesetters read the previous step's output
            f_path = os.path.join(pre_path, prefix + '.' + pre_out_type)
        if os.path.isfile(f_path) or p['typesetters'].get(p_id).get('expand'):
            args.append(f_path)
            self.create_output_path(p, p_id, args, prefix, uid)
            output, err, exit_code = self.call_typesetter(args)
            self.debug.print_debug(self, output.decode('utf-8'))
            p_path = self.organize_output(
                p,
                p_id,
                prefix,
                f_id,
                uid)
            pf_type = p.get('typesetters')[p_id].get("out_type")
        else:
            self.debug.print_debug(
                self, self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + ' ' +
                os.path.join(f_path))
        return p_path, pf_type

    def typeset_file(
            self,
            p,
            pre_path,
            pre_out_type,
            p_id,
            uid,
            f_id,
            f_name):
        """Typeset the current file with typesetter p_id.

        Looks up the typesetter's properties, verifies its binary is
        available and delegates to run_typesetter.

        Parameters
        ----------
        p : dictionary
            json program properties
        pre_path : str
            project path of the previous iteration
        pre_out_type : str
            output type of the previous iteration
        p_id : int
            typesetter id
        uid : str
            unique id of the current typesetter
        f_id : int
            sequence number of the current file
        f_name : str
            name of the current file

        Returns
        -------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter

        See Also
        --------
        run_typesetter
        """
        t_props = self.all_typesetters.get(
            p.get('typesetters')[p_id].get("name"))
        p_path, pf_type = '', ''
        if t_props:
            mt = self.arguments_parse(t_props)
            if self.gv.check_program(t_props.get('executable')):
                p_path, pf_type = self.run_typesetter(
                    p, pre_path, pre_out_type, p_id, uid, f_id, f_name, mt)
            else:
                self.debug.print_debug(
                    self, self.gv.TYPESETTER_BINARY_IS_UNAVAILABLE)
        else:
            self.debug.print_debug(
                self, self.gv.PROJECT_TYPESETTER_IS_NOT_AVAILABLE)
        return p_path, pf_type

    def typeset_files(
            self,
            p,
            pre_path,
            pre_out_type,
            pre_id):
        """Typeset all files of a certain project with typesetter pre_id.

        Parameters
        ----------
        p : dictionary
            json program properties
        pre_path : str
            project path of the previously executed typesetter
        pre_out_type : str
            project file type of the previously executed typesetter
        pre_id : int
            sequence number of the previously executed file

        Returns
        -------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter

        See Also
        --------
        typeset_file
        """
        p_path, pf_type = '', ''
        uid = str(uuid.uuid4())
        # files are keyed by sequence number; process them in order
        project_files = collections.OrderedDict(
            sorted((int(key), value)
                   for key, value in p.get('files').items()))
        if p.get('typesetters')[pre_id].get("expand"):
            # expanding typesetters run once on the whole merged file
            f_name = self.gv.uuid
            p_path, pf_type = self.typeset_file(
                p, pre_path, pre_out_type, pre_id, uid, 0, f_name)
        else:
            for f_id in project_files:
                f_name = project_files[f_id]
                p_path, pf_type = self.typeset_file(
                    p, pre_path, pre_out_type, pre_id, uid, f_id, f_name)
        return p_path, pf_type

    def typeset_project(self, p):
        """Typeset a certain project by running its typesetters in order.

        Parameters
        ----------
        p : dictionary
            json program properties

        Returns
        -------
        True : boolean
            Returns True if all the typesetters in the project have run.

        See Also
        --------
        typeset_files
        """
        typesetters_ordered, temp_path, temp_pre_out_type = '', '', ''
        pre_path = ''
        prev_out_type = ''
        if p.get('active'):
            ts = p.get('typesetters')
            if ts:
                typesetters_ordered = collections.OrderedDict(
                    sorted(ts.items()))
            else:
                self.debug.print_debug(
                    self, self.gv.PROJECT_TYPESETTERS_ARE_NOT_SPECIFIED)
            if self.all_typesetters is None:
                self.debug.print_debug(
                    self, self.gv.PROJECT_TYPESETTER_VAR_IS_NOT_SPECIFIED)
                sys.exit(1)
            # each step consumes the previous step's output path/type
            for p_id in typesetters_ordered:
                # (sic) 'Runnning' typo is in the original runtime string;
                # left unchanged here
                self.debug.print_console(
                    self, ' '.join(
                        ['Runnning Typesetter', p_id, ':',
                         p.get('typesetters')[p_id].get("name")]))
                temp_path, temp_pre_out_type = self.typeset_files(
                    p, pre_path, prev_out_type, p_id)
                pre_path = temp_path
                prev_out_type = temp_pre_out_type
                self.debug.print_console(
                    self, ' '.join(['ls -al', temp_path]))
        else:
            self.debug.print_debug(self, self.gv.PROJECT_IS_NOT_ACTIVE)
        return True

    def typeset_all_projects(self):
        """Typeset all projects defined in the json file.

        Returns
        -------
        True : boolean
            Returns True if all the typesetters in all projects have run.

        See Also
        --------
        typeset_project
        """
        projects = self.config.get('projects')
        if projects:
            for p in projects:
                self.typeset_project(p)
        else:
            self.debug.print_debug(self,
                                   self.gv.PROJECTS_VAR_IS_NOT_SPECIFIED)
        return True

    def organize_output(
            self,
            p,
            p_id,
            prefix,
            f_id,
            uid):
        """Copy the temporary results into the final project path.

        Reads the temporary results of the current typesetter step and
        copies them into the correct output folder, which is constructed
        from project name, current time, typesetter sequence number and
        file sequence number. Tool-specific actions (meTypeset, merge,
        expand) are handled here.

        Parameters
        ----------
        p : dict
            json program properties
        p_id : int
            typesetter id
        prefix : str
            file name prefix of the current file
        f_id : int
            sequence number of the current file
        uid : str
            unique id of the current typesetter

        Returns
        -------
        project_path : str
            Final path for the current file

        See Also
        --------
        create_merged_file, gv.create_dirs_recursive
        """
        p_name = p.get('typesetters')[p_id].get("name")
        # conditional expression: meTypeset output lives in an extra 'nlm'
        # sub-folder; note '+' binds tighter than 'if/else' here, so the
        # whole [path, uid, 'nlm'] list is the true branch
        t_path = [p.get('path'), uid] + ['nlm'] if p_name == 'metypeset' \
            else [p.get('path'), uid]
        out_type = p['typesetters'][p_id]['out_type']
        project_path = [p.get('path'), p['name'], self.current_result,
                        p_id + '_' + p_name, out_type]
        temp_dir = os.path.join(p.get('path'), uid)
        if p['typesetters'][p_id].get('merge'):
            self.create_merged_file(p, p_id, project_path, t_path)
            # remove the scratch dir once the last file has been merged
            if len(p.get('files').items()) == f_id:
                shutil.rmtree(temp_dir)
        elif p['typesetters'][p_id].get('expand'):
            # move every produced file into the project output folder
            for filename in os.listdir(temp_dir):
                p_path = self.gv.create_dirs_recursive(project_path)
                f_path = '{}{}{}'.format(p_path, SEP, filename)
                os.rename(os.path.join(temp_dir, filename), f_path)
            shutil.rmtree(temp_dir)
        elif p['typesetters'][p_id].get('process'):
            t_path.append(prefix + '.' + out_type)
            p_path = self.gv.create_dirs_recursive(project_path)
            f_path = '{}{}{}.{}'.format(p_path, SEP, prefix, out_type)
            # NOTE(review): unlike the sibling implementation, a missing
            # source file here raises FileNotFoundError uncaught
            os.rename(SEP.join(t_path), f_path)
            shutil.rmtree(temp_dir)
        else:
            self.debug.print_debug(
                self,
                self.gv.PROJECT_TYPESETTER_PROCESS_METHOD_NOT_SPECIFIED)
        return SEP.join(project_path)

    def create_merged_file(self, p, p_id, project_path, t_path):
        """Create a combined file from a set of input files.

        Parameters
        ----------
        p : dict
            json program properties
        p_id : int
            typesetter id
        project_path : str
            system path to be created
        t_path : str
            temporary output directory

        See Also
        --------
        create_named_file()
        """
        t_path.append(self.gv.uuid)
        p_path = self.gv.create_dirs_recursive(project_path)
        f_path = '{}{}{}.xml'.format(p_path, SEP, self.gv.uuid)
        shutil.copy2(SEP.join(t_path), f_path)
        self.create_named_file(p, p_id, p_path, t_path)
        return f_path

    def create_named_file(self, p, p_id, p_path, t_path,):
        """Copy the uniquely named merge result to the configured
        'out_file' name, when one is set.

        Parameters
        ----------
        p : dict
            json program properties
        p_id : int
            typesetter id
        p_path : str
            output directory for the current typesetter
        t_path : str
            temporary output directory
        """
        f = p['typesetters'][p_id].get('out_file')
        if f:
            f_path = '{}{}{}'.format(p_path, SEP, f)
            shutil.copy2(SEP.join(t_path), f_path)
        return
class MePrePrint(Debuggable):
    """Build a PDF that prepends a variable-substituted coversheet to a
    user-supplied article.

    The coversheet is a .docx template containing placeholders such as
    ``{ARTICLE_TITLE}``.  These are replaced with values taken from the
    command line, the template is re-zipped into a .docx, converted to PDF
    with ``unoconv``, and joined to the (also converted) user article with
    ``pdfunite``.
    """

    def __init__(self):
        # read command line arguments
        self.args = docopt(__doc__, version='meTypeset 0.1')

        # initialize debugger
        self.debug = Debug()
        self.debug.enable_debug()

        Debuggable.__init__(self, 'mePrePrint')

        # get arguments
        self.doc_type = self.args['--type']
        self.title = self.args['--article_title']
        self.name = self.args['--author']
        self.copyright_year = self.args['--year']
        self.copyright = self.args['--copyright']
        self.citation = self.args['--citation']
        self.url = self.args['--url']

        if self.doc_type == 'preprint':
            self.version = 'pre-print (not peer reviewed)'
        elif self.doc_type == 'postprint':
            self.version = 'post-print (peer reviewed)'
        elif self.doc_type == 'final':
            self.version = 'final publisher'
        else:
            # FIX: previously self.version was left undefined for any other
            # --type value, causing AttributeError in create_coversheet.
            # Fall back to the raw value.
            self.version = self.doc_type

    @staticmethod
    def copy(src, dst):
        """Copy a directory tree, falling back to a single-file copy.

        Parameters
        ----------
        src : str
            Source path (directory or file).
        dst : str
            Destination path.
        """
        try:
            shutil.copytree(src, dst)
        except OSError as exc:
            if exc.errno == errno.ENOTDIR:
                shutil.copy(src, dst)
            else:
                raise

    @staticmethod
    def do_replace(in_string, replace_text, substitute):
        """Return *in_string* with every *replace_text* replaced by *substitute*."""
        return in_string.replace(replace_text, substitute)

    @staticmethod
    def zip_dir(path, zip_file, final):
        """Recursively add every file below *path* to an open *zip_file*.

        Archive names are made relative so the zip root matches the
        *final* folder layout expected inside a .docx container.
        """
        relative = os.path.join(
            os.path.abspath(os.path.join(path, os.pardir)), final)
        for root, dirs, files in os.walk(path):
            for file_name in files:
                full = os.path.join(root, file_name)
                zip_file.write(
                    full, os.path.relpath(full, os.path.join(path, relative)))

    def create_coversheet(self, destination):
        """Expand the coversheet template, substitute variables, and convert
        it to PDF.

        Parameters
        ----------
        destination : str
            Temporary working directory.

        Returns
        -------
        str
            Path to the generated coversheet PDF.
        """
        # copy the coversheet to a temporary directory
        src = self.args['<input_cover>']
        self.debug.print_debug(self, u'Copying coversheet')
        os.mkdir(os.path.join(destination, u'coversheet'))
        # FIX: close the archive deterministically via the context manager
        with zipfile.ZipFile(src, "r") as z:
            z.extractall(os.path.join(destination, u'coversheet'))

        # open the document XML and substitute the template variables
        self.debug.print_debug(self, u'Replacing cover sheet variables')
        doc_xml = os.path.join(destination, u'coversheet/word/document.xml')
        with open(u'{0}'.format(doc_xml), 'r+') as doc_file:
            contents = doc_file.read()
            contents = self.do_replace(contents, '{ARTICLE_TITLE}', self.title)
            contents = self.do_replace(contents, '{AUTHOR_NAME}', self.name)
            contents = self.do_replace(contents, '{VERSION}', self.version)
            contents = self.do_replace(contents, '{JOURNAL_CITATION}', self.citation)
            contents = self.do_replace(contents, '{URL}', self.url)
            contents = self.do_replace(contents, '{COPYRIGHT}', self.copyright)
            contents = self.do_replace(contents, '{COPYRIGHT_YEAR}', self.copyright_year)
            doc_file.seek(0)
            doc_file.write(contents)
            doc_file.truncate()

        # hyperlinks live in the relationships part, not document.xml
        self.debug.print_debug(self, u'Replacing cover sheet hyperlinks')
        rels = os.path.join(destination, u'coversheet/word/_rels/document.xml.rels')
        with open(u'{0}'.format(rels), 'r+') as doc_file:
            contents = doc_file.read()
            contents = self.do_replace(contents, '{URL}', self.url)
            doc_file.seek(0)
            doc_file.write(contents)
            doc_file.truncate()

        # re-package the file into a docx.
        # FIX: the write-mode ZipFile was never closed before unoconv ran,
        # so the zip central directory might not have been flushed and the
        # converter could read a truncated .docx.  Close it before converting.
        docx = os.path.join(destination, u'final_cover.docx')
        with zipfile.ZipFile(docx, "w") as z:
            self.zip_dir(os.path.join(destination, u'coversheet'), z, 'coversheet')

        pdf = os.path.join(destination, u'final_cover.pdf')
        # FIX: pass argv as a list so paths containing spaces survive
        command = ['unoconv', '-f', 'pdf', docx]
        self.debug.print_debug(self, 'Running: {0}'.format(' '.join(command)))
        subprocess.call(command)

        return pdf

    def run(self):
        """Create the coversheet, convert the article, and join both PDFs."""
        # create temporary directory
        temp_dir = tempfile.mkdtemp()
        self.debug.print_debug(
            self, u'Making temporary directory {0}'.format(temp_dir))

        # create the coversheet
        pdf = self.create_coversheet(temp_dir)

        # convert the user's document into a PDF
        user_file = os.path.join(temp_dir, u'user.docx')
        user_pdf = os.path.join(temp_dir, u'user.pdf')
        shutil.copy(self.args['<input_article>'], user_file)

        command = ['unoconv', '-f', 'pdf', user_file]
        self.debug.print_debug(self, 'Running: {0}'.format(' '.join(command)))
        subprocess.call(command)

        # join the PDFs
        command = ['pdfunite', pdf, user_pdf, self.args['<output_file>']]
        self.debug.print_debug(self, 'Running: {0}'.format(' '.join(command)))
        subprocess.call(command)

        # remove the temporary directory
        self.debug.print_debug(
            self, u'Removing temporary directory {0}'.format(temp_dir))
        shutil.rmtree(temp_dir)
class Prepare(Debuggable):
    """
    Standalone Processing object to combine, clean and modify a JATS XML
    file and optionally inject BITS Metadata headers.

    Features
    --------
    add Id numbering for any tag type, clean comments, remove unused
    references, set numbering, add unique ids to certain tag types,
    sort references
    """

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.settings = Settings(self.args)
        self.gv = GV(self.settings)
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.stand_alone = self.args.get('--stand-alone')
        self.tr = etree.parse(os.path.join(self.dr, self.f))

    @staticmethod
    def read_command_line():
        """
        Reads and generates a docopt dictionary from the command line
        parameters.

        Returns
        -------
        docopt : dictionary
            A dictionary, where keys are names of command-line elements
            and values are the parsed values of those elements.
        """
        return docopt(__doc__, version='xml 0.1')

    def citations_to_references(self):
        """
        Removes the mixed-citation block and re-adds the citations as a
        <sec> section appended to the body.

        Returns
        -------
        tr : elementtree
        """
        t = self.tr.getroot()
        bd = t.find('.//body')
        sc = etree.Element('sec')
        ttl = etree.Element('title')
        ttl.text = 'References'
        sc.append(ttl)
        mc = t.findall('.//mixed-citation')
        if len(mc) > 0:
            for r in mc:
                r.tag = 'p'
                sc.append(r)
            bd.append(sc)
        # replace the (now emptied) ref-list with a fresh one in <back>
        rlst = t.find('.//ref-list')
        rlst.getparent().remove(rlst)
        bck = t.find('.//back')
        bck.append(etree.Element('ref-list'))
        return self.tr

    def clean_references(self):
        """
        Removes references which are not linked, and xrefs which point to
        no existing reference.

        Returns
        -------
        tr : elementtree

        See Also
        --------
        remove_element, remove_tags
        """
        r = self.tr.getroot()
        # drop every <ref> that no bibliographic <xref> points to
        for e in r.findall('.//back/ref-list/ref'):
            if e.attrib.get('id'):
                if r.find(".//xref[@ref-type='bibr'][@rid='"
                          + e.attrib.get('id') + "']") is None:
                    self.remove_element(e)
            else:
                self.remove_element(e)
        # unwrap every bibliographic <xref> whose target no longer exists
        for e in r.findall(".//xref[@ref-type='bibr']"):
            if r.find(".//back/ref-list/ref[@id='"
                      + e.attrib.get('rid') + "']") is None:
                if e.getparent() is not None:
                    # FIX: getiterator() is deprecated; iter() is the
                    # supported equivalent
                    for c in e.getparent().iter():
                        if c.tag == 'xref' and c.attrib.get(
                                'ref-type') == 'bibr':
                            self.remove_tags(c)
        return self.tr

    def remove_tags(self, e):
        """
        Takes an etree element and replaces it with its own text.

        Parameters
        ----------
        e : element
            Element to be replaced
        """
        if e.getparent() is not None:
            previous = e.getprevious()
            if previous is not None:
                if previous.tail:
                    if e.text:
                        previous.tail = previous.tail + e.text
                    if e.tail:
                        previous.tail = previous.tail + e.tail
            e.getparent().remove(e)

    def remove_element(self, e):
        """
        Remove any element, but only if it has a parent.

        Parameters
        ----------
        e : element
            Element to be removed
        """
        if e.getparent() is not None:
            e.getparent().remove(e)

    def set_uuids_for_back_matter(self, tags):
        """
        Add unique id tags to any of the sub-elements of the back matter.

        Parameters
        ----------
        tags : list
            list of element names

        Returns
        -------
        tr : elementtree
        """
        for s in tags:
            f = {}
            # <ref> elements are referenced with ref-type="bibr"
            ref_type = 'bibr' if s == 'ref' else s
            fns = self.tr.getroot().findall(''.join(
                ['.//xref/[@ref-type="', ref_type, '"]']))
            for i in fns:
                rid = ''.join(['bibd', str(uuid.uuid4())])
                f[i.attrib['rid']] = rid
                i.set('rid', rid)
            # rewrite the ids of the targets to match the new rids
            for m in list(f.keys()):
                n = self.tr.getroot().find(''.join(
                    ['.//' + s + '/[@id="', m, '"]']))
                if n is not None:
                    # NOTE(review): id is only rewritten when the element has
                    # children -- confirm this guard is intentional
                    n.set('id', f[m]) if len(n) > 0 else ''
        return self.tr

    def set_numbering_values(self, tag, attr, value, count, range_list):
        """
        Adds numerical values to a tag in the arguments list.

        Parameters
        ----------
        tag : str
            xml tag name
        attr : str
            attribute name
        value : str
            value name
        count : int
            current sequence number
        range_list : list
            lower and upper level for the numbering

        See Also
        --------
        set_roman_numbers
        """
        searchTag = './/' + tag + '[@' + attr + '="' + value + '"]'
        elems = self.tr.getroot().findall(searchTag)
        range_count = 1
        for elem in elems:
            elem.text, range_count = self.set_roman_numbers(
                count, range_count, range_list)
            count += 1
        return self.tr, count

    def convert_int_to_roman(self, i):
        """
        Converts an integer number into a roman number.

        Parameters
        ----------
        i : int
            integer number

        Returns
        -------
        result : str
            Roman number
        """
        result = []
        for integer, numeral in self.gv.numeral_map:
            count = i // integer
            result.append(numeral * count)
            i -= integer * count
        return ''.join(result)

    def set_roman_numbers(self, count, r_count, range_list):
        """
        Converts a given set of elements defined by range_list into roman
        numbers.

        Parameters
        ----------
        count : int
        r_count : int
        range_list : list
            lower and upper level for the numbering

        Returns
        -------
        val : str
        r_count : int

        See Also
        --------
        convert_int_to_roman
        """
        val = str(count)
        if int(range_list[0]) <= count <= int(range_list[1]):
            val = self.convert_int_to_roman(r_count).lower()
            r_count += 1
        else:
            val = str(count - r_count + 1)
        return val, r_count

    def merge_metadata(self, metadata):
        """
        Reads a metadata file path and merges its content into the
        metadata section.

        Parameters
        ----------
        metadata : str
            suffix of the metadata files

        Returns
        -------
        tr : elementTree
            Element tree of the current file

        See Also
        --------
        create_metadata_path
        """
        r = self.tr.getroot()
        pth = self.create_metadata_path(metadata)
        if os.path.isfile(pth):
            fr = r.find('.//front')
            if len(fr):
                bg = r.find('.//body').getparent()
                fr.getparent().remove(fr)
                bpm = etree.parse(pth).find('.//book-part-meta')
                if bpm is None:
                    # no BITS book-part-meta: accept a whole <front> document
                    bpm = etree.parse(pth).find('.')
                    if bpm is not None:
                        if bpm.getroottree().getroot().tag == 'front':
                            bg.insert(0, bpm)
                        else:
                            self.debug.print_debug(
                                self, 'front or bookpart metadata unspecified')
                            sys.exit(1)
                else:
                    bg.insert(0, bpm)
            else:
                self.debug.print_debug(self, 'front metadata unspecified')
        else:
            self.debug.print_debug(
                self, pth + self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST)
            sys.exit(1)
        return self.tr

    def create_metadata_path(self, metadata):
        """
        Creates the correct folder path for the metadata file.
        Metadata files should be in a folder: metadata

        Parameters
        ----------
        metadata : str
            Suffix of the metadata files

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure

        Notes
        -----
        We assume that metadata files are stored in a sub-folder named
        metadata
        """
        p = os.path.dirname(self.f).split(os.sep)
        f = os.path.basename(self.f)
        name, ext = os.path.splitext(f)
        file_name = [name, '.', metadata, ext]
        if not self.stand_alone or not os.path.exists(os.sep.join(p)):
            # strip the tool-specific sub-folders and descend into metadata/
            del p[-4:]
            p.append('metadata')
        p.append(''.join(file_name))
        pth = os.sep.join(p)
        return pth

    def sort_by_tags(self, tag_list, elem):
        """
        Sorts a list of elements alphabetically.

        Parameters
        ----------
        tag_list : list
            A list of tag types
        elem : Element
            Element to be modified
        """
        data = []
        for e in elem:
            vl = []
            for tag in tag_list:
                vl.append(e.findtext(".//" + tag))
            vl.append(e)
            data.append(tuple(vl))
        # NOTE(review): findtext may return None; sorting then raises
        # TypeError on py3 when a sort key is missing -- confirm inputs
        data.sort()
        elem[:] = [item[-1] for item in data]

    def sort_references(self, tag_list):
        """
        Sort references based on the sub-elements list.

        Parameters
        ----------
        tag_list : list
            A list of tag types

        Returns
        -------
        tr : elementTree
            Element tree of the current file

        See Also
        --------
        sort_by_tags
        """
        elem = self.tr.find('./back/ref-list')
        self.sort_by_tags(tag_list, elem)
        return self.tr

    def sort_footnotes(self, tag_list):
        """
        Sort footnotes based on the sub-elements list.

        Parameters
        ----------
        tag_list : list
            A list of tag types

        Returns
        -------
        tr : elementTree
            Element tree of the current file

        See Also
        --------
        sort_by_tags
        """
        elem = self.tr.find('./back/fn-group')
        self.sort_by_tags(tag_list, elem)
        return self.tr

    def process(self):
        """
        Process JATS-XML file and apply all requested transformations to
        the elementtree.

        See Also
        --------
        merge_metadata, set_numbering_tags, set_uuids_for_back_matter,
        sort_footnotes, sort_references, set_numbering_values
        """
        citations_to_references = self.args.get('--citations-to-references')
        clean_references = self.args.get('--clean-references')
        set_numbering_tags = self.args.get('--set-numbering-tags')
        set_unique_ids = self.args.get('--set-uuids')
        sort_footnotes = self.args.get('--sort-footnotes')
        sort_references = self.args.get('--sort-references')
        set_numbering_values = self.args.get('--set-numbering-values')
        metadata = self.args.get('--metadata')

        self.tr = self.merge_metadata(metadata) if metadata else self.tr
        self.tr = self.citations_to_references(
        ) if citations_to_references else self.tr
        self.tr = self.clean_references() if clean_references else self.tr
        self.tr = self.gv.set_numbering_tags(
            set_numbering_tags.split(','),
            self.tr) if set_numbering_tags else self.tr
        self.tr = self.set_uuids_for_back_matter(
            set_unique_ids.split(',')) if set_unique_ids else self.tr
        self.tr = self.sort_footnotes(
            sort_footnotes.split(',')) if sort_footnotes else self.tr
        self.tr = self.sort_references(
            sort_references.split(',')) if sort_references else self.tr

        # FIX: previously this loop ran unconditionally and crashed with
        # AttributeError ('NoneType' has no attribute 'split') whenever
        # --set-numbering-values was not passed.
        if set_numbering_values:
            for s in set_numbering_values.split(';'):
                vals = s.split(',')
                count = 1
                range_count = [0, 0]
                if len(vals) > 3:
                    rng = vals[3].lstrip('{').rstrip('}').split(':')
                    range_count = [int(rng[0]), int(rng[1])]
                self.tr, count = self.set_numbering_values(
                    vals[0], vals[1], vals[2], count, range_count)

        self.gv.create_dirs_recursive(self.dr.split('/'))
        self.create_xml_file(os.path.join(self.dr, os.path.basename(self.f)))

    def get_module_name(self):
        """
        Reads the name of the module for debugging and logging.

        Returns
        -------
        name : string
            Name of the Module
        """
        name = 'prepare'
        return name

    def create_xml_file(self, pth):
        """
        Write the current elementTree into the file path.

        Parameters
        ----------
        pth : str
            Correct path of the metadata file in the folder structure

        Raises
        ------
        IOError
            I/O operation fails

        Notes
        -----
        Default configuration writes a normalized XML file with XML scheme
        """
        try:
            self.tr.write(pth, pretty_print=False, xml_declaration=True)
        except IOError as e:
            print(e)
            # FIX: the message constant lives on GV, not on this class
            self.debug.print_debug(self, self.gv.XML_FILE_NOT_CREATED)

    def run(self):
        """
        Runs the configuration on the processing object.

        See Also
        --------
        process
        """
        self.process()
class GV(object):
    """Global variables: shared message strings, tool paths, and small
    filesystem/JSON helpers used across the processing modules."""

    def __init__(self):
        # GLOBAL VARIABLES
        # application paths
        self.APACHE_FOP_PATH = u'fop/fop'
        self.METYPESET_PATH = u'meTypeset/runtime/saxon9.jar'
        self.ANTENNA_HOUSE_FOP_PATH = u'/usr/AHFormatterV61_64/run.sh'
        self.XEP_FOP_PATH = u'/usr/local/xep/bin/xep/xep'

        # projects
        self.PROJECT_INPUT_FILE_JSON_IS_NOT_VALID = u'project input file json is not valid'
        self.PROJECT_INPUT_FILE_TYPE_IS_NOT_SPECIFIED = u'project input file type is not specified'
        self.PROJECT_INPUT_FILE_HAS_MORE_THAN_TWO_DOTS = u'project input file has more than two dots'
        self.PROJECT_INPUT_FILE_DOES_NOT_EXIST = u'project input_file does not exist'
        self.PROJECT_IS_NOT_ACTIVE = u'project is not active'
        self.PROJECT_OUTPUT_FILE_IS_NOT_DEFINED = u'project output file is not defined'
        self.PROJECT_OUTPUT_FILE_WAS_NOT_CREATED = u'project output file was not created'
        self.PROJECT_TYPESETTER_IS_NOT_AVAILABLE = u'project typesetter is not available'
        self.PROJECT_TYPESETTER_IS_NOT_SPECIFIED = u'project typesetter is not specified'
        self.PROJECT_TYPESETTER_NAME_IS_NOT_SPECIFIED = u'project typesetter name is not specified'
        # FIX: corrected "varaible" typo in the log message
        self.PROJECT_TYPESETTER_VAR_IS_NOT_SPECIFIED = u'project typesetter variable is not specified'
        self.PROJECT_TYPESETTERS_ARE_NOT_SPECIFIED = u'project typesetters are not specified'
        self.PROJECTS_VAR_IS_NOT_SPECIFIED = u'project variable is not specified'
        self.PROJECT_TYPESETTER_PROCESS_METHOD_NOT_SPECIFIED = u'project typesetter process method not specified'
        self.PROJECTS_TYPESETTER_RUNS_WITH_NO_ARGUMENTS = u'projects typesetter runs with no arguments'

        # typesetter errors
        self.TYPESETTER_EXECUTABLE_VARIABLE_IS_UNDEFINED = u'typesetter executable variable is undefined'
        self.TYPESETTER_FILE_OUTPUT_TYPE_IS_UNDEFINED = u'typesetter file output type is undefined'
        self.TYPESETTER_METADATA_FILE_WAS_NOT_SPECIFIED = u'Metadata file wasn\'t specified '
        self.TYPESETTER_METYPESET_RUNS_WITH_DEFAULT_METADATA_FILE = u'typesetter metypeset runs with default metadata file'
        self.TYPESETTER_IS_NOT_SPECIFIED = u'typesetter is not specified '
        self.TYPESETTER_PATH_IS_NOT_SPECIFIED = u'typesetter path is not specified '
        self.TYPESETTER_BINARY_IS_UNAVAILABLE = u'typesetter binary is unavailable '
        self.TYPESETTER_RUNS_WITH_NO_ARGUMENTS = u'typesetter runs with no arguments'

        # xml
        self.RUNNING_FO_CONVERSION = u'running FO conversion'
        self.RUNNING_PDF_CONVERSION = u'running PDF conversion'
        self.XML_ELEMENT_NOT_FOUND = u'xml element not found'
        self.XML_FILE_NOT_CREATED = u'xml file not created'
        self.XML_INPUT_FILE_IS_NOT_FOUND = u'xml input file is not found'
        self.XML_INPUT_FILE_IS_NOT_VALID = u'xml input file is not valid'
        self.SAXON_IS_NOT_AVAILABLE = u'saxon is not available'
        self.FOP_PATH_IS_NOT_AVAILABLE = u'fop path is not available'

        # WORDS
        self.OUTPUT = u'Output'

        self.debug = Debug()
        self.numeral_map = numeral_map
        self.uuid = '4e4dd8cf-26bf-4893-b037-1fd3bf08f112'
        self.version = '0.0.1'

    @staticmethod
    def fatal_error(module, message):
        """
        Prints a formatted error message and exits.

        Parameters
        ----------
        module : python module
            Returns the name of the module
        message : str
            Error message

        See Also
        --------
        module.get_module_name()
        """
        print(u'[FATAL ERROR] [{0}] {1}'.format(
            module.get_module_name(), message))
        sys.exit(1)

    def is_json(self, s):
        """
        Checks whether a string is a valid json string.

        Parameters
        ----------
        s : str
            JSON data as string

        Returns
        -------
        object or bool
            The parsed JSON value, or False when *s* is not valid JSON.
        """
        try:
            return json.loads(s)
        except ValueError:
            # FIX: dropped unreachable `return True` after the try/except
            # (both paths already return) and the unused exception binding
            return False

    def read_json(self, pth):
        """
        Reads a json file from a system path or exits.

        Parameters
        ----------
        pth : str
            path of the file in the folder structure

        Returns
        -------
        json : json
            json object
        """
        if os.path.isfile(pth):
            with open(pth) as j:
                return json.load(j)
        else:
            self.debug.print_debug(
                self, self.PROJECT_INPUT_FILE_JSON_IS_NOT_VALID)
            sys.exit(1)

    def create_dirs_recursive(self, pth):
        """
        Recursively create directories for a system path, exiting on
        failure.

        Parameters
        ----------
        pth : list
            path segments to be joined and created

        Returns
        -------
        p : str
            The created (absolute, /-rooted) path.
        """
        # NOTE(review): the leading os.path.sep makes this POSIX-rooted --
        # confirm Windows is out of scope
        p = ''
        for path in pth:
            # FIX: the second, redundant .strip('/') removed (idempotent)
            p = p + os.path.sep + path.strip('/')
            if not os.path.exists(p):
                try:
                    os.makedirs(p)
                except OSError as o:
                    # FIX: py3 print syntax (was `print o`), consistent with
                    # the rest of the file
                    print(o)
                    sys.exit(1)
        return p

    def set_numbering_tags(self, tags, tr):
        """
        Automatic numbering of the list of elements.

        Parameters
        ----------
        tags : list
            list of element names
        tr : elementtree
            tree to annotate

        Returns
        -------
        tr : elementtree
        """
        for tag in tags:
            sh = tr.findall('.//' + tag)
            sid = 1
            for i in sh:
                i.set('id', tag.replace('-', '') + str(sid))
                sid += 1
        return tr

    def check_program(self, p):
        """
        Checks whether the program or typesetter is installed and
        executable.

        Parameters
        ----------
        p : str
            Program path

        Returns
        -------
        str or None
            Resolved executable path, or None when not found.
        """
        def is_exe(f_path):
            """Return True when *f_path* exists and is executable."""
            return os.path.isfile(f_path) and os.access(f_path, os.X_OK)

        fpath, _ = os.path.split(p)
        if fpath:
            if is_exe(p):
                return p
        else:
            # search the PATH environment for the bare program name
            for path in os.environ["PATH"].split(os.pathsep):
                path = path.strip('"')
                exe_file = os.path.join(path, p)
                if is_exe(exe_file):
                    return exe_file
        return None
class Process(Debuggable):
    """
    Standalone Processing object to combine, clean and modify a JATS XML
    file and optionally inject BITS Metadata headers.

    Features
    --------
    add Id numbering for any tag type, clean comments, remove unused
    references, set numbering, add unique ids to certain tag types,
    sort references
    """

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.gv = GV()
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.tr = etree.parse(os.path.join(self.dr, self.f))

    @staticmethod
    def read_command_line():
        """
        Reads and generates a docopt dictionary from the command line
        parameters.

        Returns
        -------
        docopt : dictionary
            A dictionary, where keys are names of command-line elements
            and values are the parsed values of those elements.
        """
        return docopt(__doc__, version='xml 0.1')

    def remove_references(self):
        """
        Removes references which are not linked, and xrefs which point to
        no existing reference.

        Returns
        -------
        tr : elementtree

        See Also
        --------
        remove_element, remove_tags
        """
        r = self.tr.getroot()
        # drop every <ref> that no bibliographic <xref> points to
        for e in r.findall('.//back/ref-list/ref'):
            if e.attrib.get('id'):
                if r.find(".//xref[@ref-type='bibr'][@rid='"
                          + e.attrib.get('id') + "']") is None:
                    self.remove_element(e)
            else:
                self.remove_element(e)
        # unwrap every bibliographic <xref> whose target no longer exists
        for e in r.findall(".//xref[@ref-type='bibr']"):
            if r.find(".//back/ref-list/ref[@id='"
                      + e.attrib.get('rid') + "']") is None:
                if e.getparent() is not None:
                    # FIX: getiterator() is deprecated; iter() is the
                    # supported equivalent
                    for c in e.getparent().iter():
                        if c.tag == 'xref' and c.attrib.get('ref-type') == 'bibr':
                            self.remove_tags(c)
        return self.tr

    def remove_tags(self, e):
        """
        Takes an etree element and replaces it with its own text.

        Parameters
        ----------
        e : element
            Element to be replaced
        """
        if e.getparent() is not None:
            previous = e.getprevious()
            if previous is not None:
                if previous.tail:
                    if e.text:
                        previous.tail = previous.tail + e.text
                    if e.tail:
                        previous.tail = previous.tail + e.tail
            e.getparent().remove(e)

    def remove_element(self, e):
        """
        Remove any element, but only if it has a parent.

        Parameters
        ----------
        e : element
            Element to be removed
        """
        if e.getparent() is not None:
            e.getparent().remove(e)

    def set_uuids_for_back_matter(self, tags):
        """
        Add unique id tags to any of the sub-elements of the back matter.

        Parameters
        ----------
        tags : list
            list of element names

        Returns
        -------
        tr : elementtree
        """
        for s in tags:
            f = {}
            # <ref> elements are referenced with ref-type="bibr"
            ref_type = 'bibr' if s == 'ref' else s
            fns = self.tr.getroot().findall(
                ''.join(['.//xref/[@ref-type="', ref_type, '"]']))
            for i in fns:
                # FIX: uuid4().get_hex() is py2-only and was removed in py3;
                # the .hex attribute works on both
                rid = ''.join(['bibd', uuid.uuid4().hex])
                f[i.attrib['rid']] = rid
                i.set('rid', rid)
            # rewrite the ids of the targets to match the new rids
            # (list() matches the py3-compatible Prepare variant)
            for m in list(f.keys()):
                n = self.tr.getroot().find(
                    ''.join(['.//' + s + '/[@id="', m, '"]']))
                if n is not None:
                    # NOTE(review): id is only rewritten when the element has
                    # children -- confirm this guard is intentional
                    n.set('id', f[m]) if len(n) > 0 else ''
        return self.tr

    def set_numbering_values(self, tag, attr, value, count, range_list):
        """
        Adds numerical values to a tag in the arguments list.

        Parameters
        ----------
        tag : str
            xml tag name
        attr : str
            attribute name
        value : str
            value name
        count : int
            current sequence number
        range_list : list
            lower and upper level for the numbering

        See Also
        --------
        set_roman_numbers
        """
        searchTag = './/' + tag + '[@' + attr + '="' + value + '"]'
        elems = self.tr.getroot().findall(searchTag)
        range_count = 1
        for elem in elems:
            elem.text, range_count = self.set_roman_numbers(
                count, range_count, range_list)
            count += 1
        return self.tr, count

    def convert_int_to_roman(self, i):
        """
        Converts an integer number into a roman number.

        Parameters
        ----------
        i : int
            integer number

        Returns
        -------
        result : str
            Roman number
        """
        result = []
        for integer, numeral in self.gv.numeral_map:
            count = i // integer
            result.append(numeral * count)
            i -= integer * count
        return ''.join(result)

    def set_roman_numbers(self, count, r_count, range_list):
        """
        Converts a given set of elements defined by range_list into roman
        numbers.

        Parameters
        ----------
        count : int
        r_count : int
        range_list : list
            lower and upper level for the numbering

        Returns
        -------
        val : str
        r_count : int

        See Also
        --------
        convert_int_to_roman
        """
        val = str(count)
        if int(range_list[0]) <= count <= int(range_list[1]):
            val = self.convert_int_to_roman(r_count).lower()
            r_count += 1
        else:
            val = str(count - r_count + 1)
        return val, r_count

    def merge_metadata(self, metadata):
        """
        Reads a metadata file path and merges its content into the
        metadata section.

        Parameters
        ----------
        metadata : str
            suffix of the metadata files

        Returns
        -------
        tr : elementTree
            Element tree of the current file

        See Also
        --------
        create_metadata_path
        """
        r = self.tr.getroot()
        pth = self.create_metadata_path(metadata)
        if os.path.isfile(pth):
            # NOTE(review): no None-guards here -- a document without <front>
            # or <body> will raise; confirm inputs always carry both
            fr = r.find('.//front')
            fr.getparent().remove(fr)
            bpm = etree.parse(pth).find('.//book-part-meta')
            bg = r.find('.//body').getparent()
            bg.insert(0, bpm)
        else:
            self.debug.print_debug(
                self, pth + self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST)
        return self.tr

    def create_metadata_path(self, metadata):
        """
        Creates the correct folder path for the metadata file.
        Metadata files should be in a folder: metadata

        Parameters
        ----------
        metadata : str
            Suffix of the metadata files

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure

        Notes
        -----
        We assume that metadata files are stored in a sub-folder named
        metadata
        """
        p = os.path.dirname(self.f).split(os.sep)
        # strip the tool-specific sub-folders and descend into metadata/
        del p[-4:]
        f = os.path.basename(self.f)
        name, ext = os.path.splitext(f)
        file_name = [name, '.', metadata, ext]
        p.append('metadata')
        p.append(''.join(file_name))
        pth = os.sep.join(p)
        return pth

    def sort_by_tags(self, tag_list, elem):
        """
        Sorts a list of elements alphabetically.

        Parameters
        ----------
        tag_list : list
            A list of tag types
        elem : Element
            Element to be modified
        """
        data = []
        for e in elem:
            vl = []
            for tag in tag_list:
                vl.append(e.findtext(".//" + tag))
            vl.append(e)
            data.append(tuple(vl))
        data.sort()
        elem[:] = [item[-1] for item in data]

    def sort_references(self, tag_list):
        """
        Sort references based on the sub-elements list.

        Parameters
        ----------
        tag_list : list
            A list of tag types

        Returns
        -------
        tr : elementTree
            Element tree of the current file

        See Also
        --------
        sort_by_tags
        """
        elem = self.tr.find('./back/ref-list')
        self.sort_by_tags(tag_list, elem)
        return self.tr

    def sort_footnotes(self, tag_list):
        """
        Sort footnotes based on the sub-elements list.

        Parameters
        ----------
        tag_list : list
            A list of tag types

        Returns
        -------
        tr : elementTree
            Element tree of the current file

        See Also
        --------
        sort_by_tags
        """
        elem = self.tr.find('./back/fn-group')
        self.sort_by_tags(tag_list, elem)
        return self.tr

    def process(self):
        """
        Process JATS-XML file and apply all requested transformations to
        the elementtree.

        See Also
        --------
        merge_metadata, set_numbering_tags, set_uuids_for_back_matter,
        sort_footnotes, sort_references, set_numbering_values
        """
        clean_references = self.args.get('--clean-references')
        set_numbering_tags = self.args.get('--set-numbering-tags')
        set_unique_ids = self.args.get('--set-uuids')
        sort_footnotes = self.args.get('--sort-footnotes')
        sort_references = self.args.get('--sort-references')
        set_numbering_values = self.args.get('--set-numbering-values')
        metadata = self.args.get('--metadata')

        self.tr = self.merge_metadata(metadata) if metadata else self.tr
        self.tr = self.remove_references() if clean_references else self.tr
        self.tr = self.gv.set_numbering_tags(set_numbering_tags.split(
            ','), self.tr) if set_numbering_tags else self.tr
        self.tr = self.set_uuids_for_back_matter(
            set_unique_ids.split(',')) if set_unique_ids else self.tr
        self.tr = self.sort_footnotes(
            sort_footnotes.split(',')) if sort_footnotes else self.tr
        self.tr = self.sort_references(
            sort_references.split(',')) if sort_references else self.tr

        # FIX: previously this loop ran unconditionally and crashed with
        # AttributeError ('NoneType' has no attribute 'split') whenever
        # --set-numbering-values was not passed.
        if set_numbering_values:
            for s in set_numbering_values.split(';'):
                vals = s.split(',')
                count = 1
                range_count = [0, 0]
                if len(vals) > 3:
                    rng = vals[3].lstrip('{').rstrip('}').split(':')
                    range_count = [int(rng[0]), int(rng[1])]
                self.tr, count = self.set_numbering_values(
                    vals[0], vals[1], vals[2], count, range_count)

        self.gv.create_dirs_recursive(self.dr.split('/'))
        self.create_xml_file(
            os.path.join(self.dr, os.path.basename(self.f)))

    def create_xml_file(self, pth):
        """
        Write the current elementTree into the file path.

        Parameters
        ----------
        pth : str
            Correct path of the metadata file in the folder structure

        Raises
        ------
        IOError
            I/O operation fails

        Notes
        -----
        Default configuration writes a normalized XML file with XML scheme
        """
        try:
            self.tr.write(
                pth,
                pretty_print=False,
                xml_declaration=True
            )
        except IOError as e:
            # FIX: py3 print syntax (was `print e`), consistent with the
            # Prepare variant in this file
            print(e)
            # FIX: the message constant lives on GV, not on this class
            self.debug.print_debug(self, self.gv.XML_FILE_NOT_CREATED)

    def run(self):
        """
        Runs the configuration on the processing object.

        See Also
        --------
        process
        """
        self.process()