def convert_address_linking_elements(self, top):
    """
    Convert the address linking elements found under ``top`` into <a> tags.

    The Journal Publishing Tag Set defines the following elements as address
    linking elements: <email>, <ext-link>, <uri>. The only appropriate
    hypertext element for linking in OPS is the <a> element, so every such
    descendant of ``top`` is retagged in place and given an ``href``.

    Parameters:
      top
        Element whose descendants are searched with ``findall``; matching
        elements are modified in place.

    Returns None.
    """
    def retag_as_anchor(link, keep=None):
        #Shared conversion for ext-link and uri: the xlink:href value has to
        #be read before the attributes are purged
        link.tag = 'a'
        xlink_href_name = element_methods.ns_format(link, 'xlink:href')
        xlink_href = element_methods.get_attribute(link, xlink_href_name)
        if keep is None:
            element_methods.remove_all_attributes(link)
        else:
            element_methods.remove_all_attributes(link, exclude=keep)
        if xlink_href:
            link.attrib['href'] = xlink_href
        else:
            #No usable xlink:href, direct the link to the contained text
            link.attrib['href'] = element_methods.all_text(link)

    #Convert email to a mailto link addressed to the text it contains
    for email in top.findall('.//email'):
        element_methods.remove_all_attributes(email)
        email.tag = 'a'
        #email.text is None when the element has no leading text (empty, or
        #starting with a child element); formatting None directly produced
        #the bogus address 'mailto:None', so fall back to the contained text
        address = email.text or element_methods.all_text(email) or ''
        email.attrib['href'] = 'mailto:{0}'.format(address)

    #Ext-links often declare their address as xlink:href attribute
    #Unlike uri, an ext-link keeps its id attribute
    for ext_link in top.findall('.//ext-link'):
        retag_as_anchor(ext_link, keep=['id'])

    #Uris often declare their address as xlink:href attribute
    for uri in top.findall('.//uri'):
        retag_as_anchor(uri)
def frontiers_dc_date(article):
    """
    Given an Article class instance, this provides the method for extracting
    important dates in the history of the article. These are returned as a
    list of Date(year, month, day, event). This method looks specifically to
    locate the dates when Frontiers accepted the article and when it was
    published online.

    Accepted dates are reported with the event 'creation' and epub
    publication dates with the event 'publication'. A <year>, <month> or
    <day> missing from an accepted date is reported as an empty string.

    Returns an empty list when the article has no history element.
    """
    date_list = []
    history = article.metadata.front.article_meta.history
    #NOTE(review): returning here also skips the publication dates below;
    #confirm that is intended for articles without a history
    if history is None:
        return date_list

    def part_text(part_el):
        #A date component that is absent contributes an empty string
        if part_el is None:
            return ''
        return element_methods.all_text(part_el)

    #Creation is a Dublin Core event value: I interpret it as the date of acceptance
    #For some reason, the lxml dtd parser fails to recognize the content model
    #history (something to do with expanded content model? I am not sure yet)
    #So for now, this will illustrate a work-around using lxml search
    for date in history.node.findall('date'):
        #Dates lacking a date-type, or of any other type, are ignored
        if date.attrib.get('date-type') != 'accepted':
            continue
        year = part_text(date.find('year'))
        month = part_text(date.find('month'))
        #Previously day was left unassigned when <day> was absent, which
        #raised an error or silently reused the day of an earlier date
        day = part_text(date.find('day'))
        date_list.append(date_tup(year, month, day, 'creation'))

    #Publication is another Dublin Core event value: I use date of epub
    pub_dates = article.metadata.front.article_meta.pub_date
    for pub_date in pub_dates:
        if pub_date.attrs['pub-type'] == 'epub':
            date_list.append(date_tup(pub_date.year.text,
                                      pub_date.month.text,
                                      pub_date.day.text,
                                      'publication'))
    return date_list
def recursive_article_navmap(self, src_element, depth=0, first=True):
    """
    Recursively traverse the content of an input article, collecting the
    entries for the NCX file's navMap and Lists.

    Titled <sec> children become navpoints (with their own nested navpoints)
    in the returned list; titled <fig> and <table-wrap> children are appended
    to self.figures_list and self.tables_list instead. self.nav_depth is
    raised to the deepest level visited. The ``first`` argument is not used
    by this implementation.
    """
    if self.nav_depth < depth:
        self.nav_depth = depth
    collected = []
    wanted = ('sec', 'fig', 'table-wrap')
    for node in src_element:
        #Anything without a tag attribute yields None here and is skipped
        #along with every tag that is not of interest
        tag = getattr(node, 'tag', None)
        if tag not in wanted:
            continue
        #Elements lacking an id receive an automatic one
        if 'id' not in node.attrib:
            node.attrib['id'] = self.auto_id
        element_id = node.attrib['id']
        #Collection mode prefixes the article DOI to avoid id collisions
        if self.collection:
            nav_id = '-'.join([self.article_doi, element_id])
        else:
            nav_id = element_id
        #The label comes from the immediate title; no title or an empty
        #title means the element is skipped
        heading = node.find('title')
        if heading is None:
            continue
        label = element_methods.all_text(heading)
        if not label:
            continue
        source = 'main.{0}.xhtml#{1}'.format(self.article_doi, element_id)
        if tag == 'sec':
            #Descend first; play_order is read only after the recursion
            nested = self.recursive_article_navmap(node, depth=depth + 1)
            collected.append(
                navpoint(nav_id, label, self.play_order, source, nested))
        elif tag == 'fig':
            #Figures have no children and go to the list of figures
            self.figures_list.append(
                navpoint(element_id, label, None, source, []))
        elif tag == 'table-wrap':
            #Tables have no children and go to the list of tables
            self.tables_list.append(
                navpoint(element_id, label, None, source, []))
    return collected
def recursive_article_navmap(self, src_element, depth=0, first=True):
    """
    Recursively traverse the content of an input article, collecting the
    entries for the NCX file's navMap and Lists.

    Titled <sec> children become navpoints (with nested navpoints) in the
    returned list; titled <fig> and <table-wrap> children become navtargets
    appended to self.list_of_figures and self.list_of_tables. self.maxdepth
    is raised to the deepest level visited. Every element of interest must
    carry an id attribute. The ``first`` argument is not used by this
    implementation.
    """
    #TODO: This may need modification for non JPTS
    if self.maxdepth < depth:
        self.maxdepth = depth
    results = []
    wanted = ('sec', 'fig', 'table-wrap')
    for node in src_element:
        #Items without a tag attribute (text nodes) yield None and are
        #skipped together with tags that are not of interest
        tag = getattr(node, 'tag', None)
        if tag not in wanted:
            continue
        source_id = node.attrib['id']
        #Single mode uses the id as it is; collection mode prepends the
        #article_doi to avoid collisions
        if self.collection_mode:
            nav_id = '{0}-{1}'.format(self.article_doi, source_id)
        else:
            nav_id = source_id
        #The title text is the label; skip when it is missing or empty
        heading = node.find('title')
        if heading is None:
            continue
        label = element_methods.all_text(heading)
        if not label:
            continue
        source = 'main.{0}.xml#{1}'.format(self.article_doi, source_id)
        if tag == 'sec':
            #The play order is drawn before descending into the section
            order = self.pull_play_order()
            nested = self.recursive_article_navmap(node, depth=depth+1)
            results.append(navpoint(nav_id, label, order, source, nested))
            continue
        #figs and table-wraps do not have children
        target = navtarget(nav_id, label, source)
        if tag == 'fig':
            self.list_of_figures.append(target)
        else:
            self.list_of_tables.append(target)
    return results
def recursive_element_packing(element):
    """
    Pack an element, and recursively its declared sub-elements, into a
    namedtuple.

    The element's definition is looked up in dtd_dict by tag. The resulting
    tuple has a 'node' field holding the element itself, an 'attrs' field
    holding a dict with one entry per declared attribute (None when the
    element does not carry it), one field per sub-element reported by
    get_sub_elements (a list for 'multiple' occurrence, otherwise a packed
    child or None), and a 'text' field when the content reports 'pcdata'.

    Returns None when element is None.
    """
    if element is None:
        return None
    tagname = element.tag
    declaration = dtd_dict[tagname]

    #Compose the attrs dict with appropriate keys and values
    attrs = {}
    for attr_def in declaration.iterattributes():
        prefix = attr_def.prefix
        if not prefix:
            key = attr_def.name
            lookup = key
        elif prefix == 'xmlns':  # Pseudo-attribute
            continue
        else:
            if prefix == 'xml':
                lookup = '{{http://www.w3.org/XML/1998/namespace}}{0}'.format(attr_def.name)
            else:
                lookup = '{' + element.nsmap[prefix] + '}' + attr_def.name
            key = '{0}:{1}'.format(prefix, attr_def.name)
        #Absent attributes are recorded as None; implied defaults are not
        #considered
        attrs[key] = element.attrib.get(lookup)

    #'node' is a self reference to the element, 'attrs' receives the dict
    names = ['node', 'attrs']
    values = [element, attrs]

    has_text = False  # Set when PCDATA is in the content model
    for spec in get_sub_elements(declaration.content, first=True):
        child_tag = spec.tag
        if child_tag == 'pcdata':
            has_text = True
            continue
        if spec.occurrence == 'multiple':
            packed = [recursive_element_packing(found)
                      for found in element.findall(child_tag)]
        else:
            #A missing child packs to None through the guard at the top
            packed = recursive_element_packing(element.find(child_tag))
        names.append(child_tag)
        values.append(packed)

    if has_text:
        names.append('text')
        values.append(element_methods.all_text(element))

    #Make the names safe for namedtuple: coerce characters, then prepend
    #'l' to anything that is a reserved keyword
    names = [coerce_string(name) for name in names]
    type_name = 'l' + tagname if iskeyword(tagname) else tagname
    names = [('l' + name) if iskeyword(name) else name for name in names]
    packed_type = namedtuple(coerce_string(type_name), ', '.join(names))
    return packed_type(*values)
def recursive_article_navmap(self, src_element, depth=0, first=True):
    """
    Build NCX navigation data by recursively walking an article's content.

    Returns the list of navpoints for the titled <sec> children of
    src_element, each carrying the navpoints of its own sub-sections.
    Titled <fig> and <table-wrap> children are recorded as navpoints in
    self.figures_list and self.tables_list. self.nav_depth tracks the
    greatest depth reached. The ``first`` argument is not used by this
    implementation.
    """
    if not depth <= self.nav_depth:
        self.nav_depth = depth
    navpoints = []
    for item in src_element:
        try:
            kind = item.tag
        except AttributeError:
            continue
        if kind not in ['sec', 'fig', 'table-wrap']:
            continue
        #Safely handle missing id attributes
        if 'id' not in item.attrib:
            item.attrib['id'] = self.auto_id
        raw_id = item.attrib['id']
        #In collection mode the article DOI is prepended to avoid collisions
        point_id = '-'.join([self.article_doi, raw_id]) if self.collection else raw_id
        #An element is only listed when its immediate title has text
        title_element = item.find('title')
        if title_element is None:
            continue
        label = element_methods.all_text(title_element)
        if not label:
            continue
        source = 'main.{0}.xhtml#{1}'.format(self.article_doi, raw_id)
        if kind == 'sec':
            #Sub-sections are gathered before play_order is read
            subpoints = self.recursive_article_navmap(item, depth=depth + 1)
            section_point = navpoint(point_id, label, self.play_order,
                                     source, subpoints)
            navpoints.append(section_point)
            continue
        #figs and table-wraps do not have children and keep their raw id
        if kind == 'fig':
            destination = self.figures_list
        else:
            destination = self.tables_list
        destination.append(navpoint(raw_id, label, None, source, []))
    return navpoints