示例#1
0
 def repl_func(m):
     """
         Replaces <CIT ID=0 /> with <REF ID=cit0 REFID=ref0 />
     """
     cit_id=m.group(1) # this is the actual unique identifier of a citation in the document
     ref_id=doc.citation_by_id[cit_id]["ref_id"] # this is the id of the reference the cit cites
     return u'<REF ID="'+safe_unicode(cit_id)+u'" REFID="'+safe_unicode(ref_id)+'" />'
示例#2
0
 def repl_func(m):
     """
         Replaces <CIT ID=0 /> with <REF ID=cit0 REFID=ref0 />
     """
     cit_id = m.group(
         1
     )  # this is the actual unique identifier of a citation in the document
     ref_id = doc.citation_by_id[cit_id][
         "ref_id"]  # this is the id of the reference the cit cites
     return u'<REF ID="' + safe_unicode(
         cit_id) + u'" REFID="' + safe_unicode(ref_id) + '" />'
示例#3
0
def saveSapientaXML(doc,filename):
    """
        Exports a SciDocJSON to a file in Sapienta sort-of SciXML format
    """
    header_depth=0

    def dumpMetadata(doc):
        """
        """
        lines=[]
        lines.append('<mode2 hasDoc="yes" name="'+doc.metadata["filename"].replace(".xml","")+'" version="597"/>')
        lines.append("<METADATA>")
        if doc["metadata"].has_key("fileno"):
            lines.append("<FILENO>"+doc.metadata["fileno"]+"</FILENO>\n")

        lines.append("<FILENAME>"+doc.metadata["filename"]+"</FILENAME>\n")

        lines.append("<APPEARED>")
        if "conference" in doc.metadata:
            lines.append("<CONFERENCE>"+doc.metadata["conference"]+"</CONFERENCE>\n")
        if "journal" in doc.metadata:
            lines.append("<JOURNAL>"+doc.metadata["journal"]+"</JOURNAL>\n")

        lines.append(u"<CURRENT_AUTHORLIST>")
        for author in doc.metadata["authors"]:
            bits=author.split(",")
            rest=" ".join(bits[1:])
            lines.append(u"<CURRENT_AUTHOR>")
##            lines.append("<FIRSTNAME>%s</FIRSTNAME>" % rest)
            rest=fixAmpersandsHTML(rest+u" ")
            bits[0]=fixAmpersandsHTML(bits[0]+u" ").strip()
            lines.append(rest+u"<CURRENT_SURNAME>%s</CURRENT_SURNAME>" % bits[0])
            lines.append(u"</CURRENT_AUTHOR>")
        lines.append(u"</CURRENT_AUTHORLIST>")

##        lines.append("<SURNAMES>")
##        for author in doc["authors"]:
##
##        lines.append("</SURNAMES>")

        lines.append(u"<YEAR>"+doc.metadata["year"]+"</YEAR>")
        lines.append(u"</APPEARED>")

        if doc["metadata"].has_key("revisionhistory"):
            lines.append(u"<REVISIONHISTORY>"+doc.metadata["revisionhistory"]+u"</REVISIONHISTORY>\n")
        lines.append(u"</METADATA>")
        return lines

    def dumpAbstract(doc):
        """
        """
        lines=[]
        lines.append(u"<ABSTRACT>")
        if "content" in doc.abstract:
            for element_id in doc.abstract["content"]:
                s=doc.element_by_id[element_id]
                if s["type"]=="s":
                    lines.append(dumpSentence(doc,s, True))
                if s["type"]=="p":
                    lines.extend(dumpParagraph(doc,s, True))
        lines.append(u"</ABSTRACT>")
        return lines


    def dumpSentence(doc,s,in_abstract=False):
        """
            Fixes all the contents of the sentence, returns a string of proper XML
        """

        def repl_func(m):
            """
                Replaces <CIT ID=0 /> with <REF ID=cit0 REFID=ref0 />
            """
            cit_id=m.group(1) # this is the actual unique identifier of a citation in the document
            ref_id=doc.citation_by_id[cit_id]["ref_id"] # this is the id of the reference the cit cites
            return u'<REF ID="'+safe_unicode(cit_id)+u'" REFID="'+safe_unicode(ref_id)+'" />'

        in_abstract=False # hack because of sapienta difference vs SciXML
        if in_abstract:
            xmltag=u"A-S"
        else:
            xmltag=u"s"

        s_id=re.match(r"\w+?(\d+)",s["id"],re.IGNORECASE)
        assert(s_id)
        res=u"<"+xmltag+u' sid="%s"' % str(int(s_id.group(1)))

        # !TODO change this to export more markers, such as CFC and whatever
        if "az" in s:
            res+=u'AZ="%s"' % s["az"]

        text=s["text"]
        text=re.sub(r"<CIT ID=(.*?)\s?/>",repl_func, text)

        res+=">"+text+"</"+xmltag+">"

        return res

    def dumpParagraph(doc,paragraph, in_abstract=False):
        """
        """
        lines=[]

        # fix for empty paragraphs: should it be here?
        if len(paragraph["content"]) == 0:
            return lines

        if not in_abstract:
            lines.append(u"<P>")
        for element in paragraph["content"]:
            element=doc.element_by_id[element]
            if element["type"]=="s":
                lines.append(dumpSentence(doc,element, in_abstract))

        if not in_abstract:
            lines.append(u"</P>")
        return lines

    def dumpSection(doc,section, header_depth):
        """
        """
        header_depth+=1
        lines=[]
        lines.append(u'<DIV depth="'+str(header_depth)+'">')
        lines.append(u"<HEADER>%s</HEADER>" % section["header"])
        for element in section["content"]:
            element=doc.element_by_id[element]
            if element["type"]=="section":
                lines.extend(dumpSection(doc,element, header_depth))
            elif element["type"]=="p":
                lines.extend(dumpParagraph(doc,element))

        lines.append(u"</DIV>")
        header_depth-=1
        return lines

    def dumpBody(doc):
        """
        """
        lines=[]
        lines.append(u"<BODY>")
        for section in doc.allsections:
            if section != doc.abstract:
                lines.extend(dumpSection(doc,section,0))
        lines.append(u"</BODY>")
        return lines

    def dumpRefAuthor(doc,author):
        """
        """
        lines=[]
        lines.append(u"<AUTHOR>")
        if isinstance(author,basestring):
            lines.append(author)
        elif isinstance(author,dict):
            lines.append(author["given"])
            lines.append(u"<SURNAME>%s</SURNAME>" % author["family"])
        lines.append(u"</AUTHOR>")
        return lines

    def dumpRefAuthors(doc,ref):
        """
        """
        lines=[]
        lines.append("<AUTHORLIST>")
        for author in ref["authors"]:
            lines.extend(dumpRefAuthor(doc,author))
        lines.append("</AUTHORLIST>")
        return lines

    def dumpReference(doc,ref):
        """
        <REFERENCE ID="cit1">
        <AUTHORLIST>
        <AUTHOR>L.<SURNAME>Que, Jr.</SURNAME></AUTHOR>
        <AUTHOR>R. Y. N.<SURNAME>Ho</SURNAME></AUTHOR>
        </AUTHORLIST>
        <TITLE></TITLE>
        <JOURNAL>Chem. Rev.<YEAR>1996</YEAR>962607-2624</JOURNAL></REFERENCE>
        """
        lines=[]
        lines.append('<REFERENCE ID="%s">' % ref["id"])
        lines.extend(dumpRefAuthors(doc,ref))
        lines.append("<TITLE>%s</TITLE>" % ref["title"])
        lines.append("<YEAR>%s</YEAR>" % ref["year"])
        if "journal" in ref:
            lines.append("<JOURNAL>%s</JOURNAL>" % ref["journal"])
        lines.append("</REFERENCE>")
        return lines

    def dumpReferences(doc):
        """
        """
        lines=[]
        lines.append("<REFERENCELIST>")
        for ref in doc.data["references"]:
            lines.extend(dumpReference(doc,ref))
        lines.append("</REFERENCELIST>")
        return lines

    lines=[]


    lines.append(u'<?xml version="1.0" encoding="UTF-8" standalone="no"?>')
##    lines.append('<!DOCTYPE PAPER SYSTEM "paper-structure-annotation.dtd">')
    lines.append(u"<PAPER>")
    lines.extend(dumpMetadata(doc))
    lines.extend(dumpAbstract(doc))
    lines.extend(dumpBody(doc))
    lines.extend(dumpReferences(doc))
    lines.append(u"</PAPER>")

    lines2=[]
    for line in lines:
        lines2.append(safe_unicode(line))
    lines=lines2

##    file_str = StringIO()
##    file_str.writelines(lines)
##    full_str=file_str.getvalue()
    full_str=u"".join(lines)
##    full_str=fixAmpersandsHTML(full_str)

    f=codecs.open(filename,"w", encoding="utf-8",errors="ignore")
    f.write(full_str)
    f.close()
示例#4
0
def saveSapientaXML(doc, filename):
    """
        Exports a SciDocJSON to a file in Sapienta sort-of SciXML format
    """
    header_depth = 0

    def dumpMetadata(doc):
        """
        """
        lines = []
        lines.append('<mode2 hasDoc="yes" name="' +
                     doc.metadata["filename"].replace(".xml", "") +
                     '" version="597"/>')
        lines.append("<METADATA>")
        if "fileno" in doc["metadata"]:
            lines.append("<FILENO>" + doc.metadata["fileno"] + "</FILENO>\n")

        lines.append("<FILENAME>" + doc.metadata["filename"] + "</FILENAME>\n")

        lines.append("<APPEARED>")
        if "conference" in doc.metadata:
            lines.append("<CONFERENCE>" + doc.metadata["conference"] +
                         "</CONFERENCE>\n")
        if "journal" in doc.metadata:
            lines.append("<JOURNAL>" + doc.metadata["journal"] +
                         "</JOURNAL>\n")

        lines.append(u"<CURRENT_AUTHORLIST>")
        for author in doc.metadata["authors"]:
            bits = author.split(",")
            rest = " ".join(bits[1:])
            lines.append(u"<CURRENT_AUTHOR>")
            ##            lines.append("<FIRSTNAME>%s</FIRSTNAME>" % rest)
            rest = fixAmpersandsHTML(rest + u" ")
            bits[0] = fixAmpersandsHTML(bits[0] + u" ").strip()
            lines.append(rest +
                         u"<CURRENT_SURNAME>%s</CURRENT_SURNAME>" % bits[0])
            lines.append(u"</CURRENT_AUTHOR>")
        lines.append(u"</CURRENT_AUTHORLIST>")

        ##        lines.append("<SURNAMES>")
        ##        for author in doc["authors"]:
        ##
        ##        lines.append("</SURNAMES>")

        lines.append(u"<YEAR>" + doc.metadata["year"] + "</YEAR>")
        lines.append(u"</APPEARED>")

        if "revisionhistory" in doc["metadata"]:
            lines.append(u"<REVISIONHISTORY>" +
                         doc.metadata["revisionhistory"] +
                         u"</REVISIONHISTORY>\n")
        lines.append(u"</METADATA>")
        return lines

    def dumpAbstract(doc):
        """
        """
        lines = []
        lines.append(u"<ABSTRACT>")
        if "content" in doc.abstract:
            for element_id in doc.abstract["content"]:
                s = doc.element_by_id[element_id]
                if s["type"] == "s":
                    lines.append(dumpSentence(doc, s, True))
                if s["type"] == "p":
                    lines.extend(dumpParagraph(doc, s, True))
        lines.append(u"</ABSTRACT>")
        return lines

    def dumpSentence(doc, s, in_abstract=False):
        """
            Fixes all the contents of the sentence, returns a string of proper XML
        """
        def repl_func(m):
            """
                Replaces <CIT ID=0 /> with <REF ID=cit0 REFID=ref0 />
            """
            cit_id = m.group(
                1
            )  # this is the actual unique identifier of a citation in the document
            ref_id = doc.citation_by_id[cit_id][
                "ref_id"]  # this is the id of the reference the cit cites
            return u'<REF ID="' + safe_unicode(
                cit_id) + u'" REFID="' + safe_unicode(ref_id) + '" />'

        in_abstract = False  # hack because of sapienta difference vs SciXML
        if in_abstract:
            xmltag = u"A-S"
        else:
            xmltag = u"s"

        s_id = re.match(r"\w+?(\d+)", s["id"], re.IGNORECASE)
        assert (s_id)
        res = u"<" + xmltag + u' sid="%s"' % str(int(s_id.group(1)))

        # !TODO change this to export more markers, such as CFC and whatever
        if "az" in s:
            res += u'AZ="%s"' % s["az"]

        text = s["text"]
        text = re.sub(r"<CIT ID=(.*?)\s?/>", repl_func, text)

        res += ">" + text + "</" + xmltag + ">"

        return res

    def dumpParagraph(doc, paragraph, in_abstract=False):
        """
        """
        lines = []

        # fix for empty paragraphs: should it be here?
        if len(paragraph["content"]) == 0:
            return lines

        if not in_abstract:
            lines.append(u"<P>")
        for element in paragraph["content"]:
            element = doc.element_by_id[element]
            if element["type"] == "s":
                lines.append(dumpSentence(doc, element, in_abstract))

        if not in_abstract:
            lines.append(u"</P>")
        return lines

    def dumpSection(doc, section, header_depth):
        """
        """
        header_depth += 1
        lines = []
        lines.append(u'<DIV depth="' + str(header_depth) + '">')
        lines.append(u"<HEADER>%s</HEADER>" % section["header"])
        for element in section["content"]:
            element = doc.element_by_id[element]
            if element["type"] == "section":
                lines.extend(dumpSection(doc, element, header_depth))
            elif element["type"] == "p":
                lines.extend(dumpParagraph(doc, element))

        lines.append(u"</DIV>")
        header_depth -= 1
        return lines

    def dumpBody(doc):
        """
        """
        lines = []
        lines.append(u"<BODY>")
        for section in doc.allsections:
            if section != doc.abstract:
                lines.extend(dumpSection(doc, section, 0))
        lines.append(u"</BODY>")
        return lines

    def dumpRefAuthor(doc, author):
        """
        """
        lines = []
        lines.append(u"<AUTHOR>")
        if isinstance(author, six.string_types):
            lines.append(author)
        elif isinstance(author, dict):
            lines.append(author["given"])
            lines.append(u"<SURNAME>%s</SURNAME>" % author["family"])
        lines.append(u"</AUTHOR>")
        return lines

    def dumpRefAuthors(doc, ref):
        """
        """
        lines = []
        lines.append("<AUTHORLIST>")
        for author in ref["authors"]:
            lines.extend(dumpRefAuthor(doc, author))
        lines.append("</AUTHORLIST>")
        return lines

    def dumpReference(doc, ref):
        """
        <REFERENCE ID="cit1">
        <AUTHORLIST>
        <AUTHOR>L.<SURNAME>Que, Jr.</SURNAME></AUTHOR>
        <AUTHOR>R. Y. N.<SURNAME>Ho</SURNAME></AUTHOR>
        </AUTHORLIST>
        <TITLE></TITLE>
        <JOURNAL>Chem. Rev.<YEAR>1996</YEAR>962607-2624</JOURNAL></REFERENCE>
        """
        lines = []
        lines.append('<REFERENCE ID="%s">' % ref["id"])
        lines.extend(dumpRefAuthors(doc, ref))
        lines.append("<TITLE>%s</TITLE>" % ref["title"])
        lines.append("<YEAR>%s</YEAR>" % ref["year"])
        if "journal" in ref:
            lines.append("<JOURNAL>%s</JOURNAL>" % ref["journal"])
        lines.append("</REFERENCE>")
        return lines

    def dumpReferences(doc):
        """
        """
        lines = []
        lines.append("<REFERENCELIST>")
        for ref in doc.data["references"]:
            lines.extend(dumpReference(doc, ref))
        lines.append("</REFERENCELIST>")
        return lines

    lines = []

    lines.append(u'<?xml version="1.0" encoding="UTF-8" standalone="no"?>')
    ##    lines.append('<!DOCTYPE PAPER SYSTEM "paper-structure-annotation.dtd">')
    lines.append(u"<PAPER>")
    lines.extend(dumpMetadata(doc))
    lines.extend(dumpAbstract(doc))
    lines.extend(dumpBody(doc))
    lines.extend(dumpReferences(doc))
    lines.append(u"</PAPER>")

    lines2 = []
    for line in lines:
        lines2.append(safe_unicode(line))
    lines = lines2

    ##    file_str = StringIO()
    ##    file_str.writelines(lines)
    ##    full_str=file_str.getvalue()
    full_str = u"".join(lines)
    ##    full_str=fixAmpersandsHTML(full_str)

    f = codecs.open(filename, "w", encoding="utf-8", errors="ignore")
    f.write(full_str)
    f.close()