class AddArticlesToMetsFile( Step ):

    def setup(self): = 'Indsæt indholdsfortegnelse i METS metadata'
        self.config_main_section = 'add_articles_to_mets_file'
        self.essential_config_sections = set( ['process_folder_structure',
                                               'dbc'] )
        self.essential_commandlines = {
            'process_path' : 'folder'

    def step(self):
        # NOTE(review): this method appears truncated -- the ``try:`` block
        # that the handlers below belong to is not present, so the body is
        # not valid Python as it stands. The guarded calls must be restored
        # from the original file; they cannot be inferred from here.
        #
        # Each handler returns the exception message concatenated with the
        # formatted traceback as one string.
        # NOTE(review): the three handlers are identical; ValueError and
        # IOError are both subclasses of Exception, so the last handler
        # alone would give the same result.
        except ValueError as e:
            return str(e) + str(traceback.format_exc())
        except IOError as e:
            return str(e) + str(traceback.format_exc())
        except Exception as e:
            return str(e) + str(traceback.format_exc())

    def getVariables(self):
        # NOTE(review): this method appears truncated in the source. The
        # docstring text below has lost its quote delimiters, and several
        # calls (both os.path.join(...) calls and both self.getSetting(...)
        # calls) are missing arguments and/or their closing parenthesis, so
        # the body is not valid Python as it stands. Code lines are left
        # untouched here; restore them from the original file.
        This method pulls in all the variables
        from the command line and the config file 
        that are necessary for its running.
        We need a path to our toc file, our meta.xml
        and a link to our DBC data service (eXist API).
        Errors in variables will lead to an 
        Exception being thrown.
        # Reset per-run state before reading the configuration.
        self.page_offset = None
        self.issnSet = False
        process_path = self.command_line.process_path
        # Directory holding the .toc file; the first file with a '.toc'
        # extension in that directory is used.
        # NOTE(review): this join is unclosed and has only one argument --
        # presumably process_path was the first argument; confirm.
        toc_dir = os.path.join(
            self.getConfigItem('metadata_toc_path', section='process_folder_structure')
        toc_name = tools.getFirstFileWithExtension(toc_dir, '.toc')
        self.toc_file_path = os.path.join(toc_dir, toc_name)
        # URL template for the DBC service (formatted with an article id in
        # getDBCData).
        self.service_url = self.getConfigItem('dbc_service', section='dbc')
        # NOTE(review): unclosed join, same damage as for toc_dir above.
        self.meta_file = os.path.join(
            self.getConfigItem('metadata_goobi_file', section='process_files')
        # Parse initial Goobi METS file to a dictionary tree for processing
        self.meta_data,_ = dict_tools.parseXmlToDict(self.meta_file)
        # For pdf info
        pdf_input = self.getConfigItem('doc_limbpdf_path',
                                       section= 'process_folder_structure')
        self.pdf_input_dir = os.path.join(process_path, pdf_input)
        # parse for overlapping articles
        # NOTE(review): remaining arguments and closing parenthesis missing.
        self.overlapping_articles = self.getSetting('overlapping_articles',
        # parse boolean from command line - for overlapping articles
        # NOTE(review): the comment above does not match the setting read
        # below ('default_language'); remaining arguments and closing
        # parenthesis are missing here as well.
        self.default_language = self.getSetting('default_language',

    def getDBCData(self, article_id):
        """Fetch the DBC record for one article.

        The configured service URL template is filled in with
        ``article_id`` and handed to ``MarcXml.initFromWeb``.

        :param article_id: id inserted into ``self.service_url``
        :return: whatever ``MarcXml.initFromWeb`` returns for that URL
        """
        return MarcXml.initFromWeb(self.service_url.format(article_id))

    def getPdf(self):
        """Locate the pdf in the pdf input directory and read its info.

        Stores the file name in ``self.pdf_name``, the full path in
        ``self.pdf_path`` and the result of ``tools.pdfinfo`` in
        ``self.pdfinfo``. The first file with a '.pdf' extension is used.
        """
        pdf_dir = self.pdf_input_dir
        self.pdf_name = tools.getFirstFileWithExtension(pdf_dir, '.pdf')
        self.pdf_path = os.path.join(pdf_dir, self.pdf_name)
        self.pdfinfo = tools.pdfinfo(self.pdf_path)

    def parseTocFile(self):
        # Builds the TOC object from the toc file and checks it for known
        # page-number problems.
        # NOTE(review): this method appears truncated in the source. The
        # TOC(...) call below is missing its remaining arguments and closing
        # parenthesis, and ``msg`` is assigned but never used in the visible
        # code (presumably a logging/warning call was lost). Code lines are
        # left untouched.
        self.toc_data = TOC(self.toc_file_path,self.service_url,
                            self.pdfinfo, self.overlapping_articles,
        # Various checks of the toc-file
        data_check = self.toc_data.erroneousPages()
        if data_check == 1:
            # Message (Danish): only one article exists for the issue; more
            # can be created in Goobi's metadata editor, or the issue can be
            # sent through LIMB again.
            msg = ('NB!!! Der er kun en artikel for hæftet. Flere kan oprettes '
                   ' via Goobis metadata-editor eller hæftet kan sendes '
                   ' LIMB igen. Suk!')
        elif data_check == 2:
            # Message (Danish): a LIMB error gave every article the same
            # start page; this must be corrected manually per article in the
            # METS editor.
            msg = ('NB!!! En fejl i LIMB har medført, at alle hæftets artikler '
                   'har samme startside. Dette skal rettes manuelt i '
                   'METS-editoren for hver enkelt artikel. Suk!')

    def writeXml(self):
        # NOTE(review): this method appears truncated in the source -- the
        # docstring text below has lost its quote delimiters and no
        # statement that performs the write is present. Restore the body
        # from the original file.
        Write the xml generated back to file

    def isIssnSet(self):
        """Record whether the METS metadata already has an ISSN field.

        Asks ``mets_tools.hasMetadataField`` whether ``self.meta_data``
        contains the field 'ISSN' and stores the answer in ``self.issnSet``
        (expected to be True when the field is set, otherwise False).
        """
        self.issnSet = mets_tools.hasMetadataField(self.meta_data, 'ISSN')
    def addIssnToPeriodical(self, issn):
        # NOTE(review): this method appears truncated in the source. The
        # docstring text below has lost its quote delimiters and the
        # mets_tools.addFieldToDocType(...) call is missing its remaining
        # arguments and closing parenthesis. Code lines are left untouched.
        Adds the field ISSN to the doc stuct type PeriodicalVolume, if it isn't
        already set.
        :param issn:
        # The helper's return value becomes the new self.issnSet flag.
        self.issnSet = mets_tools.addFieldToDocType(self.meta_data,

    def buildXml(self):
        """Update the parsed METS data with the table-of-contents results.

        Given a toc object consisting of articles with DBC ids, use the DBC
        service to generate data for each article. When all data is
        created, append this to the existing meta.xml data.
        """
        # NOTE(review): no call that creates the front matter, article or
        # back matter sections (or that checks the ISSN) is visible in this
        # method, although the description above implies them. They were
        # presumably lost from the source -- restore from the original file.
        # Apply the page offset found in the toc to the physical struct map.
        mets_tools.addOffsetToPhysicalStructMap(self.meta_data, self.toc_data.page_offset)
        self.meta_data = mets_tools.expandPagesFromChildrenToParent(self.meta_data)
    def createFrontMatterSection(self):
        """Create the 'FrontMatter' articles when the toc has such a section."""
        front_matter = self.toc_data.getFrontMatterSection()
        if not front_matter:
            return
        self.createArticles(front_matter, 'FrontMatter')
    def createArticlesSection(self):
        """Create the 'Articles' articles when the toc has such a section."""
        article_section = self.toc_data.getArticlesSection()
        if not article_section:
            return
        self.createArticles(article_section, 'Articles')
    def createBackMatterSection(self):
        """Create the 'BackMatter' articles when the toc has such a section."""
        back_matter = self.toc_data.getBackMatterSection()
        if not back_matter:
            return
        self.createArticles(back_matter, 'BackMatter')
    def createArticles(self,section,section_type):
        # Adds a doc struct for ``section_type`` when none exists yet, then
        # processes every article in ``section.articles``.
        # NOTE(review): this method appears truncated in the source -- the
        # final mets_tools.addNewDocStruct(...) call is missing its remaining
        # arguments and closing parenthesis. Code lines are left untouched.
        if not mets_tools.docTypeExists(self.meta_data, section_type):
            # Create section, e.g. FrontMatter if it doesn't exists
            section_data = {'doc_type': section_type,'content': [{'name':'TitleDocMain','data':section_type}]}
            self.meta_data = mets_tools.addNewDocStruct(self.meta_data,section_data)
        articles = section.articles
        #TODO: Create articles-docstruct if not already there
        # NOTE(review): section_attrib is not used in the visible code;
        # presumably it was an argument of the truncated call below.
        section_attrib = ('TYPE',section_type)
        for article in articles:
            article_data = self.createArticleData(article)
            # Skip articles without data and articles already present.
            if article_data and not self.articleExists(article_data):
                self.meta_data = mets_tools.addNewDocStruct(self.meta_data,
    def articleExists(self,article_data):
        # Returns True when mets_tools.articleExists reports the article as
        # already present in self.meta_data, otherwise False.
        # NOTE(review): this method appears truncated in the source -- the
        # ``if`` condition is missing its remaining arguments, closing
        # parenthesis and colon, and ``err`` is built but never used in the
        # visible code (presumably a logging call was lost). Code lines are
        # left untouched.
        # Title is the 'data' of the first 'TitleDocMain' content entry; an
        # IndexError is raised here if there is no such entry.
        article_title = [c['data'] for c in article_data['content']
                         if c['name'] == 'TitleDocMain'][0]
        # NOTE(review): start_page/end_page are not used in the visible
        # code; presumably they were arguments of the truncated call.
        start_page = article_data['start_page']
        end_page = article_data['end_page']
        if mets_tools.articleExists(self.meta_data, article_title,
            err = ('Article "{0}" already exist in METS-file. Possible '
                   'duplicate. Article is skipped.')
            # NOTE(review): under Python 3 formatting an encoded (bytes)
            # title renders as b'...' in the message -- confirm the target
            # Python version.
            err = err.format(article_title.encode('utf-8'))
            return True
        else: return False
    def createArticleData(self, article):
        # NOTE(review): this method appears truncated in the source. The
        # docstring text below has lost its quote delimiters (and the end of
        # its example), the author fallback branch is garbled, and the
        # ``if`` that should add the ISSN has no body, so the method is not
        # valid Python as it stands. Code lines are left untouched; restore
        # them from the original file.
        Create a metadata structure that can be 
        consumed by the Meta XML builder class.
        This takes the form of a list of dictionaries,
        with each dictionary representing a field or set of fields.
        For example: [{'name': 'Abstract', 'data' : 'From the Roman Empire...' }, 
            {'name' : 'TitleDocMain', 'data' : 'Return of the oppressed'},
            {'name' : 'Author', 'type' : 'person', 'fields' : [
                {'tag' : 'goobi:firstName', 'data' : 'Peter'},
                {'tag' : 'goobi:lastName', 'data' : 'Turchin'}
        See the MetaXml class for more details.
        content = list()
        # Set language
        content.append({'name': 'DocLanguage', 'data':article.language})
        # Add the article id (stored as 'dbcMarcxID') when the article has one
        if article.article_id:
            content.append({'name': 'dbcMarcxID', 'data':article.article_id})
        # Add title, update time and sub title
        content.append({'name': 'TitleDocMain', 'data':article.title})
        if article.update_time:
            content.append({'name': 'UpdateTime', 'data':article.update_time})
        if article.sub_title:
            content.append({'name': 'TitleDocSub1', 'data':article.sub_title})
        # Add subjects
        for subject in article.subjects:
            content.append({'name': 'Subject', 'data':subject})
        # Add description and content description
        if article.description:
            content.append({'name': 'Description', 'data':article.description})
        if article.content_description:
            content.append({'name': 'ContentDescription', 'data':article.content_description})
        # Read start and end page; they are added to article_data below
        start_page = article.start_page
        end_page = article.end_page
        # Add authors
        if article.authors: # multiple authors or author from dbc-data
            # Each author is indexed as (given name, family name).
            for author in article.authors:
                given_name = author[0]
                family_name  = author[1]
                author_element = self.__createAuthorElement(given_name, family_name)
                if author_element: content.append(author_element)
            # NOTE(review): the lines from here to the end of this branch
            # are garbled -- the expressions being split are missing and the
            # ``else``/``elif`` lines separating the cases appear to be
            # lost. Presumably this handled a single author string split
            # into given/family name; confirm against the original file.
            if len(' ')) > 1: # multiple names
                given_name, family_name =' ',1) 
                given_name = 
                family_name = ''
            author_element = self.__createAuthorElement(given_name, family_name)
            if author_element: content.append(author_element)
            given_name = ''
            family_name = ''
            author_element = self.__createAuthorElement(given_name, family_name)
            if author_element: content.append(author_element)
            # TODO: Do check on the numbers of author names. If very long
            # raise a note to quality control. This is a larger implementation.
            # An example of a long author field:
            #    "Else Marie Pedersen i samarbejde med Iørn Piø og Holger Rasmussen"
        # create elements for any other authors
        # TODO: routine to split up author field - e.g. use ';' to separate
        # authors.
        # Add issn if it isnt set. ISSN lives in the metadata for the issue
        # but comes from articles in DBCs metadata. Thus we give the issue 
        # the ISSN from the article, if it isn't previously set. 
        # NOTE(review): the body of this ``if`` is missing -- presumably a
        # call to self.addIssnToPeriodical(article.issn); confirm.
        if not self.issnSet and article.issn:
        # Join article element to be added to mets-file 
        article_data = {'doc_type': 'Article',
                        'content': content,
                        'start_page': start_page,
                        'end_page': end_page}
        return article_data

    def __createAuthorElement(self, firstname, lastname):
        Given a firstname and a lastname, create 
        a list of dictionaries representing a single author
        in the following form:
        [{'tag' : 'firstName', 'data' : 'Peter'},
        {'tag' : 'lastName', 'data' : 'Turchin'}]
        If firstname and lastName are empty, it will return
        an empty hash
        author = dict()
        author['name'] = 'Author'
        author['type'] = 'person'
        author_fields = list()
        if firstname:
            firstname_elem = dict()
            firstname_elem['tag'] = 'firstName'
            firstname_elem['data'] = firstname
        if lastname:
            lastname_elem = dict()
            lastname_elem['tag'] = 'lastName'
            lastname_elem['data'] = lastname
        # build the best display name we can, given the 
        # data available to us
        display_name = dict()
        display_name['tag'] = 'displayName'
        if firstname and lastname:
            display_name['data'] = u"{0}, {1}".format(lastname, firstname)
        elif lastname:
            display_name['data'] = lastname
        elif firstname:
            display_name['data'] = firstname

        if author_fields:
            author['fields'] = author_fields
            return author
            return None