Python SciDoc примеры использования

Язык программирования: Python

Пространство имен/Пакет: minerva.scidoc.scidoc

Класс/Тип: SciDoc

Примеров на hotexamples.com: 4

Python SciDoc — найдено 4 примера. Это лучшие примеры кода на Python для minerva.scidoc.scidoc.SciDoc, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

addParagraph(2)

addSection(2)

updateAuthorsAffiliations(2)

updateReferences(2)

abstract(1)

addCitation(1)

addExistingReference(1)

addSentence(1)

loadExistingMetadata(1)

Пример #1

Показать файл

Файл: read_jatsxml.py Проект: danieldmm/minerva

    def read(self, xml, identifier):
        """
            Parse a JATS/NLM (PubMed) XML string and build a SciDoc from it.

            :param xml: the complete XML document as a string
            :type xml: basestring
            :param identifier: name identifying the document, e.g. a file
                        name; only its base name (``os.path.basename``) is
                        stored under ``metadata["filename"]``
            :type identifier: basestring
            :returns: the new document. If the XML has no ``<body>``,
                        ``debugAddMessage`` is called with an error and the
                        document is returned early with only the filename,
                        citation style and a generated guid set.
            :rtype: SciDoc
        """
        # workaround for a "bug" in BeautifulStoneSoup with "sec" tags;
        # must be applied before the XML is parsed
        BeautifulStoneSoup.NESTABLE_TAGS["sec"] = []
        parsed = BeautifulStoneSoup(xml)

        # the SciDoc that will hold the paper
        doc = SciDoc()
        metadata = doc["metadata"]
        metadata["filename"] = os.path.basename(identifier)
        metadata["original_citation_style"] = detectCitationStyle(xml)

        body = parsed.find("body")
        if not body:
            # TODO: Make the error handling less terrible
            debugAddMessage(doc, "error", "NO <BODY> IN THIS PAPER! file: " + identifier)
            doc["metadata"]["guid"] = cp.Corpus.generateGUID()
            return doc

        # metadata first: the guid is generated from it
        self.loadJATSMetadataFromPaper(doc, parsed)
        metadata["guid"] = cp.Corpus.generateGUID(metadata)

        # references live in <back>/<ref-list>; anything else inside <back>
        # (e.g. appendices) is ignored for now
        back = parsed.find("back")
        ref_list = back.find("ref-list") if back else None
        if ref_list:
            for ref in ref_list.findAll("ref"):
                self.loadJATSReference(ref, doc)

        doc.updateReferences()

        # abstract, then every top-level section of the body
        self.loadJATSAbstract(parsed, doc)

        for section in body.findChildren("sec", recursive=False):
            self.loadJATSSection(section, doc, "root")

        doc.updateAuthorsAffiliations()
        return doc

Пример #2

Показать файл

Файл: athar_corpus.py Проект: danieldmm/minerva

    def wrapInSciDoc(self, contexts, doc_from_id, doc_to_id):
        """
            Returns a SciDoc ready to be passed to the standard context_extract
            functions, where each context is a paragraph

            The document gets a single root section, a single reference (the
            target document) and one paragraph per context. Each line of a
            context becomes a sentence carrying the line's sentiment; if the
            line text matches CIT_MARKER, a citation to the target reference
            is added and the marker is replaced by CITATION_FORM filled with
            the citation id.

            Args:
                contexts: list of context dicts; each is read for
                    context["lines"], whose items are read for "text" and
                    "sentiment"
                doc_from_id: corpus_id of this SciDoc
                doc_to_id: corpus_id of target document (citation)
            Returns:
                SciDoc
            Raises:
                ValueError: if no metadata is found in the corpus for
                    doc_to_id
        """
        newDocument = SciDoc()

        source_metadata = cp.Corpus.getMetadataByField("metadata.corpus_id", doc_from_id)
        if source_metadata:
            newDocument.loadExistingMetadata(source_metadata)
        else:
            # document not in the corpus: fall back to its corpus_id as guid
            newDocument.metadata["guid"] = doc_from_id
        # either way the document must end up with a non-empty guid
        assert newDocument.metadata["guid"] != ""

        newDocument.metadata["corpus_id"] = doc_from_id

        newSection_id = newDocument.addSection("root", "", 0)

        target_metadata = cp.Corpus.getMetadataByField("metadata.corpus_id", doc_to_id)
        if not target_metadata:
            raise ValueError("Target document %s is not in corpus!" % doc_to_id)

        # the only reference of this document is the cited target document
        ref = newDocument.addExistingReference(target_metadata)
        ref["corpus_id"] = doc_to_id

        for context in contexts:
            newPar_id = newDocument.addParagraph(newSection_id)
            for line in context["lines"]:
                newSent_id = newDocument.addSentence(newPar_id)
                text = line["text"]
                citations = []
                if re.search(CIT_MARKER, text):
                    # the citation must exist before substitution, because
                    # the replacement text embeds its id
                    newCit = newDocument.addCitation(newSent_id, ref["id"])
                    text = re.sub(CIT_MARKER, CITATION_FORM % newCit["id"], text)
                    citations.append(newCit["id"])

                sent = newDocument.element_by_id[newSent_id]
                sent["sentiment"] = line["sentiment"]
                sent["text"] = text
                # "citations" is only set on sentences that contain a citation
                if citations:
                    sent["citations"] = citations

        return newDocument

Пример #3

Показать файл

Файл: read_paperxml.py Проект: danieldmm/minerva

    def read(self, xml, identifier):
        """
            Load a PaperXML into a SciDoc.

            Args:
                xml: full xml string
                identifier: an identifier for this document, e.g. file name
                        Important: supply an actual path so that we can check
                        for the metadata in bibtexml
                        If an actual full path, the path will be removed from it
                        when stored
            Returns:
                SciDoc instance. The document is returned early, after a
                debugAddMessage error call and without guid, references or
                sections, when the XML has no <body>, when
                metadata["surnames"] is an empty list, or when
                metadata["title"] equals [].
        """
        xml = self.cleanUpPaperXML(xml)
        soup = BeautifulStoneSoup(xml, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

        # Create a new SciDoc to store the paper
        newDocument = SciDoc()
        metadata = newDocument["metadata"]
        metadata["filename"] = os.path.basename(identifier)

        body = soup.find("body")
        if not body:
            # TODO: Make the error handling less terrible
            debugAddMessage(newDocument, "error", "NO <BODY> IN THIS PAPER! file: " + identifier)
            return newDocument

        # Load metadata, either from corpus or from file
        self.loadPaperMetadata(newDocument, soup, identifier)
        if metadata["surnames"] == []:
            debugAddMessage(newDocument, "error", "NO SURNAMES OF AUTHORS file: " + identifier)
            return newDocument

        # NOTE(review): the title is compared against an empty list; if
        # loadPaperMetadata stores it as a string this check never fires.
        # Confirm the intended type.
        if metadata["title"] == []:
            debugAddMessage(newDocument, "error", "NO TITLE file: " + identifier)
            return newDocument

        metadata["guid"] = cp.Corpus.generateGUID(metadata)

        # Load all references from the XML
        references = body.find("references")
        if references:
            self.loadPaperReferences(references, newDocument)

        newDocument.updateReferences()

        # Citation style is fixed to APA: there is no good detection
        # algorithm for AFI, so nothing is detected from the section text
        metadata["original_citation_style"] = "APA"

        # Load Abstract
        self.loadPaperAbstract(soup, newDocument)

        for sec in body.findChildren("section", recursive=False):
            self.loadPaperSection(sec, newDocument, "root")

        # second pass, run after the sections have been loaded
        # (presumably because loading sections can touch the references;
        # verify against loadPaperSection)
        newDocument.updateReferences()
        newDocument.updateAuthorsAffiliations()
        return newDocument

Пример #4

Показать файл

Файл: azscixml.py Проект: danieldmm/minerva

def loadAZSciXML(filename):
    """
        Read a Cambridge-style SciXML file and convert it into a SciDoc.

        Args:
            filename: path of the file to load; it is stored as
                metadata["filepath"] and its base name as
                metadata["filename"]
        Returns:
            SciDoc. If the XML contains no <paper> element,
            debugAddMessage is called with an error and the document is
            returned with only filename and filepath set.
    """
    raw_xml = loadFileText(filename)
    soup = BeautifulStoneSoup(raw_xml)

    # document number from <docno>, empty string when the tag is missing
    docno_tag = soup.find("docno")
    if docno_tag:
        docno = docno_tag.text
    else:
        docno = ""

    # the SciDoc that will hold the paper
    doc = SciDoc()
    doc["metadata"]["filename"] = os.path.basename(filename)
    doc["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(doc, "error", "NO <PAPER> IN THIS PAPER! file: " + filename)
        return doc

    # Metadata would come from the corpus when available; that lookup is
    # currently disabled, so the value is always None and the metadata is
    # read from the file itself.
    corpus_metadata = None

    if corpus_metadata:
        doc["metadata"]["conference"] = ""
        for field in corpus_metadata:
            doc["metadata"][field] = corpus_metadata[field]
    else:
        loadMetadata(doc, paper, docno, soup)

    doc["metadata"]["guid"] = cp.Corpus.generateGUID(doc["metadata"])

    # references (listed at the end of the document)
    for reference in soup.findAll("reference"):
        processReferenceXML(reference, doc)

    # abstract: one section with one paragraph, one sentence per <a-s>
    abstract = soup.find("abstract")
    if abstract:
        abstract_section_id = doc.addSection("root", "Abstract")
        abstract_par_id = doc.addParagraph(abstract_section_id)

        for sentence in abstract.findChildren("a-s"):
            # adds the sentence and processes the references inside it
            addNewSentenceAndProcessRefs(sentence, doc, abstract_par_id, abstract_section_id)

        doc.abstract = doc.element_by_id[abstract_section_id]
    else:
        debugAddMessage(doc, "error", "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract

    # document structure
    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, doc)

    # try to match each citation with its reference
    matchCitationsWithReferences(doc)

    # NOTE: "in press", "forthcoming", "submitted", "to appear" are dates
    # that still need fixing and matching. (An earlier debugging snippet here
    # was noted as not working because of unicode.)

    return doc