예제 #1
0
def fix_broken_scidocs():
    """
        Reload the SciDocs of the "PMC_CSC" collection from its input XML.

        Selects the Elasticsearch corpus backend on `cp`, connects it using
        the host/port constants exposed by `minerva.squad.celery_app`, and
        then hands the work to `CorpusImporter.reloadSciDocsOnly`.

        Per the original author's note, that importer call is expected to go
        through the papers already in the collection, try to load each scidoc
        and re-read the XML when a KeyError occurs. That logic lives in the
        importer and is not visible from this function.
    """
    corpus_root = "g:\\nlp\\phd\\pmc_coresc"
    xml_dir = corpus_root + "\\inputXML"
    papers_query = 'metadata.collection_id:"PMC_CSC"'

    cp.useElasticCorpus()

    # Kept as a function-scope import, performed after the backend has been
    # selected, so the original call order is preserved.
    import minerva.squad.celery_app as celery_app
    es_endpoint = {
        "host": celery_app.MINERVA_ELASTICSEARCH_SERVER_IP,
        "port": celery_app.MINERVA_ELASTICSEARCH_SERVER_PORT,
    }
    cp.Corpus.connectCorpus(corpus_root, endpoint=es_endpoint)

    scidoc_importer = CorpusImporter("PMC_CSC", "initial", use_celery=True)
    scidoc_importer.generate_corpus_id = getPMC_CSC_corpus_id
    scidoc_importer.reloadSciDocsOnly(papers_query, xml_dir, "*.xml")
예제 #2
0
def import_sapienta_pmc_corpus():
    """
        Run the import of the Sapienta-annotated PMC corpus ("PMC_CSC").

        Builds a CorpusImporter around a SapientaJATSXMLReader, connects the
        Elasticsearch-backed corpus rooted at g:\\nlp\\phd\\pmc_coresc and
        calls importCorpus over every "*.xml" file in its inputXML folder,
        with Celery enabled on the importer.

        The options passed here switch off document conversion/import and
        force regeneration of resolvable citations; the collection id and
        import id are forced to "PMC_CSC" / "fixed".
    """
    from minerva.squad.config import MINERVA_ELASTICSEARCH_ENDPOINT

    corpus_root = "g:\\nlp\\phd\\pmc_coresc"

    pmc_importer = CorpusImporter(reader=SapientaJATSXMLReader())
    pmc_importer.collection_id = "PMC_CSC"
    pmc_importer.import_id = "initial"
    pmc_importer.generate_corpus_id = getPMC_CSC_corpus_id

    # Elasticsearch backend is used; cp.useLocalCorpus() was the disabled
    # alternative in the original.
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(corpus_root, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)

    # Defaults quoted below are the original author's annotations, not
    # verified against the importer.
    import_options = {}
    import_options["convert_and_import_docs"] = False  # noted default: True
    import_options["force_generate_resolvable_citations"] = True  # noted default: False
    # Per the original note: sets the collection_id again after updating
    # references; used for fixes.
    import_options["force_collection_id"] = "PMC_CSC"
    import_options["force_import_id"] = "fixed"
    # Options left disabled by the author: "reload_xml_if_doc_in_collection",
    # "list_missing_references", "update_doc_references".
    # Also disabled: overriding corpus_import.FILES_TO_PROCESS_FROM/_TO and
    # calling importer.restartCollectionImport(options).

    pmc_importer.use_celery = True
    pmc_importer.importCorpus(
        corpus_root + "\\inputXML",
        file_mask="*.xml",
        import_options=import_options,
    )
예제 #3
0
def import_aac_corpus():
    """
        Run the import of the AAC corpus (collection "AAC").

        Builds a CorpusImporter around a PaperXMLReader, connects the
        Elasticsearch-backed corpus rooted at g:\\nlp\\phd\\aac, installs an
        AANReferenceMatcher on cp.Corpus and calls importCorpus over every
        "*-paper.xml" file in the inputXML folder, with Celery enabled on
        the importer.
    """
    from minerva.squad.celery_app import MINERVA_ELASTICSEARCH_ENDPOINT

    corpus_root = "g:\\nlp\\phd\\aac"

    aac_importer = CorpusImporter(reader=PaperXMLReader())
    aac_importer.collection_id = "AAC"
    aac_importer.import_id = "initial"
    aac_importer.generate_corpus_id = getACL_corpus_id

    # Elasticsearch backend is used; cp.useLocalCorpus() was the disabled
    # alternative in the original.
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(corpus_root, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)

    # Document conversion/import is switched off (the author notes the
    # default as True). "list_missing_references" was left disabled.
    import_options = {}
    import_options["convert_and_import_docs"] = False

    # Disabled in the original: overriding corpus_import.FILES_TO_PROCESS_FROM
    # / FILES_TO_PROCESS_TO, importer.restartCollectionImport(options) and
    # cp.Corpus.createAndInitializeDatabase().
    acl_listing = "g:\\nlp\\phd\\aan\\release\\acl_full.txt"
    cp.Corpus.matcher = AANReferenceMatcher(acl_listing)

    aac_importer.use_celery = True
    aac_importer.importCorpus(
        corpus_root + "\\inputXML",
        file_mask="*-paper.xml",
        import_options=import_options,
    )
예제 #4
0
def import_pmc_corpus():
    """
        Run the import of the PMC corpus (collection "PMC").

        Builds a CorpusImporter around a JATSXMLReader, connects the
        Elasticsearch-backed corpus rooted at g:\\nlp\\phd\\pmc (no explicit
        endpoint is passed), imports every "*.nxml" file from its inputXML
        folder and finally calls updateInCollectionReferences over all
        papers listed by cp.Corpus.listPapers().
    """

    def getPMC_corpus_id(filename):
        """
            Placeholder id generator for a PMC file: always returns "".

            It is not attached to the importer anywhere in this function.
            The disabled original body derived the id from the file name by
            stripping "-paper.xml" and lower-casing it.
        """
        return ""

    corpus_root = "g:\\nlp\\phd\\pmc"

    pmc_importer = CorpusImporter(reader=JATSXMLReader())
    pmc_importer.collection_id = "PMC"
    pmc_importer.import_id = "initial"
    # No generate_corpus_id is assigned here; the original had the
    # getACL_corpus_id assignment commented out.

    # Elasticsearch backend is used; cp.useLocalCorpus() was the disabled
    # alternative in the original.
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(corpus_root)

    pmc_importer.importCorpus(corpus_root + "\\inputXML", file_mask="*.nxml")

    # Second pass over everything now in the corpus, with empty options.
    pmc_importer.updateInCollectionReferences(cp.Corpus.listPapers(), {})