Пример #1
0
    def read(self, xml, identifier):
        """
            Parse a JATS/NLM (PubMed) XML string and build a SciDoc from it.

            :param xml: the complete XML document as a string
            :type xml: basestring
            :param identifier: name identifying this document, e.g. a file
                        name. Only its basename (os.path.basename) is stored.
            :type identifier: basestring
            :returns: the populated :class:`SciDoc <SciDoc>`. When the XML has
                        no <body>, the SciDoc is returned early with only the
                        filename, the detected citation style, an error
                        message and a freshly generated GUID.
            :rtype: SciDoc
        """
        # Workaround for a BeautifulStoneSoup "bug" with "sec" tags. This is
        # set on the class itself, not on a single parser instance.
        BeautifulStoneSoup.NESTABLE_TAGS["sec"] = []
        # NOTE: the fixNumberCitationsXML(xml) pre-processing step is disabled
        parsed = BeautifulStoneSoup(xml)

        # The SciDoc that will hold the paper
        doc = SciDoc()
        doc_metadata = doc["metadata"]
        doc_metadata["filename"] = os.path.basename(identifier)
        doc_metadata["original_citation_style"] = detectCitationStyle(xml)

        body_tag = parsed.find("body")
        if not body_tag:
            # TODO: Make the error handling less terrible
            debugAddMessage(
                doc,
                "error",
                "NO <BODY> IN THIS PAPER! file: "+identifier,
            )
            doc["metadata"]["guid"] = cp.Corpus.generateGUID()
            return doc

        # Metadata first: the GUID is generated from it
        self.loadJATSMetadataFromPaper(doc, parsed)
        doc_metadata["guid"] = cp.Corpus.generateGUID(doc_metadata)

        # References: only <back>/<ref-list> is read; anything else inside
        # <back> (e.g. appendices) is ignored for now
        back_tag = parsed.find("back")
        ref_list_tag = back_tag.find("ref-list") if back_tag else None
        if ref_list_tag:
            for ref_tag in ref_list_tag.findAll("ref"):
                self.loadJATSReference(ref_tag, doc)

        doc.updateReferences()

        # Abstract, then every top-level <sec> of the body
        self.loadJATSAbstract(parsed, doc)

        top_level_sections = body_tag.findChildren("sec", recursive=False)
        for section_tag in top_level_sections:
            self.loadJATSSection(section_tag, doc, "root")

        doc.updateAuthorsAffiliations()
        return doc
Пример #2
0
    def wrapInSciDoc(self, contexts, doc_from_id, doc_to_id):
        """
            Returns a SciDoc ready to be passed to the standard context_extract
            functions, where each context is a paragraph

            The document gets a single root section and a single reference
            (the target document). Every line of a context becomes a sentence;
            a line whose text matches CIT_MARKER gets one citation to that
            reference, and the marker is replaced by CITATION_FORM filled in
            with the new citation's id.

            Args:
                contexts: list of context dicts. Each must have a "lines" list
                    whose items are dicts with "text" and "sentiment" keys
                doc_from_id: corpus_id of this SciDoc
                doc_to_id: corpus_id of target document (citation)
            Returns:
                SciDoc
            Raises:
                ValueError: if no metadata is found in the corpus for doc_to_id
        """
        newDocument = SciDoc()
        source_metadata = cp.Corpus.getMetadataByField("metadata.corpus_id", doc_from_id)
        if source_metadata:
            newDocument.loadExistingMetadata(source_metadata)
        else:
            # Document not in the corpus: fall back to its corpus_id as guid
            newDocument.metadata["guid"] = doc_from_id
        # Whichever path was taken, the document must end up with a guid
        assert newDocument.metadata["guid"] != ""

        newDocument.metadata["corpus_id"] = doc_from_id

        newSection_id = newDocument.addSection("root", "", 0)

        target_metadata = cp.Corpus.getMetadataByField("metadata.corpus_id", doc_to_id)
        if not target_metadata:
            raise ValueError("Target document %s is not in corpus!" % doc_to_id)

        ref = newDocument.addExistingReference(target_metadata)
        ref["corpus_id"] = doc_to_id

        for context in contexts:
            # one paragraph per context
            newPar_id = newDocument.addParagraph(newSection_id)
            for line in context["lines"]:
                newSent_id = newDocument.addSentence(newPar_id)
                text = line["text"]
                citations = []
                if re.search(CIT_MARKER, text):
                    # One citation per sentence, however many markers it has:
                    # all markers are replaced with the same citation token
                    newCit = newDocument.addCitation(newSent_id, ref["id"])
                    text = re.sub(CIT_MARKER, CITATION_FORM % newCit["id"], text)
                    citations.append(newCit["id"])

                sent = newDocument.element_by_id[newSent_id]
                sent["sentiment"] = line["sentiment"]
                sent["text"] = text
                if citations:
                    sent["citations"] = citations

        return newDocument
Пример #3
0
    def read(self, xml, identifier):
        """
            Load a PaperXML into a SciDoc.

            Args:
                xml: full xml string
                identifier: an identifier for this document, e.g. file name
                        Important: supply an actual path so that we can check
                        for the metadata in bibtexml
                        If an actual full path, the path will be removed from it
                        when stored
            Returns:
                SciDoc instance. If the XML has no <body>, or the loaded
                metadata has no author surnames or no title, the SciDoc is
                returned early with an error message attached and without a
                guid, references or content.
        """
        xml = self.cleanUpPaperXML(xml)
        soup = BeautifulStoneSoup(xml, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

        # Create a new SciDoc to store the paper
        newDocument = SciDoc()
        metadata = newDocument["metadata"]
        metadata["filename"] = os.path.basename(identifier)

        body = soup.find("body")
        if not body:
            # TODO: Make the error handling less terrible
            debugAddMessage(newDocument, "error", "NO <BODY> IN THIS PAPER! file: "+identifier)
            return newDocument

        # Load metadata, either from corpus or from file
        self.loadPaperMetadata(newDocument, soup, identifier)
        # NOTE(review): both checks below only match an empty *list*; an empty
        # string or None would pass through - confirm what loadPaperMetadata
        # stores when these fields are missing
        if metadata["surnames"] == []:
            debugAddMessage(newDocument, "error", "NO SURNAMES OF AUTHORS file: "+identifier)
            return newDocument

        if metadata["title"] == []:
            debugAddMessage(newDocument, "error", "NO TITLE file: "+identifier)
            return newDocument

        metadata["guid"] = cp.Corpus.generateGUID(metadata)

        # Load all references from the XML
        references = body.find("references")
        if references:
            self.loadPaperReferences(references, newDocument)

        newDocument.updateReferences()

        sections = body.findChildren("section", recursive=False)

        # Citation style detection is disabled (no good detection algorithm
        # for AFI), so the style is fixed to APA
        metadata["original_citation_style"] = "APA"

        # Load Abstract
        self.loadPaperAbstract(soup, newDocument)

        for sec in sections:
            self.loadPaperSection(sec, newDocument, "root")

        # Second update, this time after the abstract and sections are loaded
        newDocument.updateReferences()
        newDocument.updateAuthorsAffiliations()
        return newDocument
Пример #4
0
def loadAZSciXML(filename):
    """
        Load a Cambridge-style SciXML file into a SciDoc.

        Reads the file, then loads (in this order) metadata, references,
        the abstract and every <div>, and finally matches citations with
        their references.

        Args:
            filename: path of the SciXML file. The full path is stored as
                metadata "filepath" and its basename as metadata "filename"
        Returns:
            SciDoc instance. If the XML has no <paper> element, the SciDoc
            is returned early with only the file name/path metadata and an
            error message. A missing <abstract> only adds an error message;
            loading continues.
    """
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)

    docno = soup.find("docno")
    fileno = docno.text if docno else ""

    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error", "NO <PAPER> IN THIS PAPER! file: "+filename)
        return newDocument

    # Corpus-based metadata lookup is disabled, so the metadata is always
    # loaded from the file itself
    loadMetadata(newDocument, paper, fileno, soup)

    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(newDocument["metadata"])

    # Load all references (at the end of the document) from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)

    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error", "CANNOT LOAD ABSTRACT! file: "+ filename+"\n")
        # TODO: LOAD first paragraph as abstract
    else:
        # The abstract is stored as a section of its own with one paragraph
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)

        for s in abstract.findChildren("a-s"):
            # deals with all of the adding of a sentence
            addNewSentenceAndProcessRefs(s, newDocument, newPar_id, newSection_id)

        newDocument.abstract = newDocument.element_by_id[newSection_id]

    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)

    # try to match each citation with its reference
    matchCitationsWithReferences(newDocument)

    # TODO: "in press", "forthcoming", "submitted", "to appear" = dates to
    # fix & match. (A debug printout of each reference's "AZ" values was
    # disabled here: it does not work because of unicode.)

    return newDocument