def import_sapienta_pmc_corpus():
    """
        Run the import of the Sapienta-annotated PMC corpus.

        Builds a CorpusImporter around a SapientaJATSXMLReader, tags it as
        collection "PMC_CSC" / import "initial", switches ``cp`` to the
        Elastic corpus and connects it with MINERVA_ELASTICSEARCH_ENDPOINT,
        then calls importCorpus() over every "*.xml" file in the input
        directory with ``use_celery`` enabled.
    """
    from minerva.squad.config import MINERVA_ELASTICSEARCH_ENDPOINT

    corpus_dir = "g:\\nlp\\phd\\pmc_coresc"
    input_xml_dir = "g:\\nlp\\phd\\pmc_coresc\\inputXML"

    sapienta_importer = CorpusImporter(reader=SapientaJATSXMLReader())
    sapienta_importer.collection_id = "PMC_CSC"
    sapienta_importer.import_id = "initial"
    sapienta_importer.generate_corpus_id = getPMC_CSC_corpus_id

    # Disabled alternative backend: cp.useLocalCorpus()
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(corpus_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)

    # Options currently left at their defaults (disabled entries):
    #   "reload_xml_if_doc_in_collection": True   (default: False)
    #   "list_missing_references": True           (default: False)
    #   "update_doc_references": False            (default: True)
    import_settings = {
        # default: True
        "convert_and_import_docs": False,
        # default: False
        "force_generate_resolvable_citations": True,
        # Sets the collection_id again after updating references.
        # Used for fixes.
        "force_collection_id": "PMC_CSC",
        "force_import_id": "fixed",
    }

    # Disabled debugging toggles, kept for reference:
    #   corpus_import.FILES_TO_PROCESS_FROM=4500
    #   corpus_import.FILES_TO_PROCESS_TO=10
    #   importer.restartCollectionImport(options)
    sapienta_importer.use_celery = True
    sapienta_importer.importCorpus(
        input_xml_dir,
        file_mask="*.xml",
        import_options=import_settings,
    )
def import_aac_corpus():
    """
        Run the import of the AAC corpus.

        Builds a CorpusImporter around a PaperXMLReader, tags it as
        collection "AAC" / import "initial", switches ``cp`` to the Elastic
        corpus and connects it with MINERVA_ELASTICSEARCH_ENDPOINT, installs
        an AANReferenceMatcher on ``cp.Corpus`` and then calls importCorpus()
        over every "*-paper.xml" file in the input directory with
        ``use_celery`` enabled.
    """
    # NOTE(review): import_sapienta_pmc_corpus() takes this constant from
    # minerva.squad.config instead -- confirm which module is canonical.
    from minerva.squad.celery_app import MINERVA_ELASTICSEARCH_ENDPOINT

    corpus_dir = "g:\\nlp\\phd\\aac"
    input_xml_dir = "g:\\nlp\\phd\\aac\\inputXML"
    aan_reference_file = "g:\\nlp\\phd\\aan\\release\\acl_full.txt"

    aac_importer = CorpusImporter(reader=PaperXMLReader())
    aac_importer.collection_id = "AAC"
    aac_importer.import_id = "initial"
    aac_importer.generate_corpus_id = getACL_corpus_id

    # Disabled alternative backend: cp.useLocalCorpus()
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(corpus_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)

    # Option currently left at its default (disabled entry):
    #   "list_missing_references": True   (default: False)
    import_settings = {
        # default: True
        "convert_and_import_docs": False,
    }

    # Disabled debugging toggles, kept for reference:
    #   corpus_import.FILES_TO_PROCESS_FROM=10222
    #   corpus_import.FILES_TO_PROCESS_TO=500
    #   importer.restartCollectionImport(options)
    #   cp.Corpus.createAndInitializeDatabase()
    cp.Corpus.matcher = AANReferenceMatcher(aan_reference_file)
    aac_importer.use_celery = True
    aac_importer.importCorpus(
        input_xml_dir,
        file_mask="*-paper.xml",
        import_options=import_settings,
    )
def import_pmc_corpus():
    """
        Run the import of the PMC corpus.

        Builds a CorpusImporter around a JATSXMLReader, tags it as collection
        "PMC" / import "initial", switches ``cp`` to the Elastic corpus,
        connects it (no endpoint argument is passed here), imports every
        "*.nxml" file from the input directory and finally calls
        updateInCollectionReferences() with the papers listed by the corpus
        and an empty options dict.
    """
    def getPMC_corpus_id(filename):
        """
            Returns the PMC id for a file

            NOTE(review): currently a stub that always yields "" and is not
            assigned to the importer below -- confirm whether it should be.
        """
        # Disabled implementation, kept for reference:
        #   return os.path.split(filename)[1].replace("-paper.xml","").lower()
        return ""

    corpus_dir = "g:\\nlp\\phd\\pmc"
    input_xml_dir = "g:\\nlp\\phd\\pmc\\inputXML"

    pmc_importer = CorpusImporter(reader=JATSXMLReader())
    pmc_importer.collection_id = "PMC"
    pmc_importer.import_id = "initial"
    # Disabled: importer.generate_corpus_id=getACL_corpus_id

    # Disabled alternative backend: cp.useLocalCorpus()
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(corpus_dir)

    pmc_importer.importCorpus(input_xml_dir, file_mask="*.nxml")

    all_papers = cp.Corpus.listPapers()
    pmc_importer.updateInCollectionReferences(all_papers, {})