Пример #1
0
def import_sapienta_pmc_corpus():
    """
        Run the import of the Sapienta-annotated PMC corpus ("PMC_CSC").

        Builds a CorpusImporter around a SapientaJATSXMLReader, selects the
        Elasticsearch corpus backend, connects it to the configured endpoint
        and then starts a celery-enabled import of the *.xml files found in
        the corpus' inputXML directory.
    """
    from minerva.squad.config import MINERVA_ELASTICSEARCH_ENDPOINT

    corpus_dir = "g:\\nlp\\phd\\pmc_coresc"
    input_dir = "g:\\nlp\\phd\\pmc_coresc\\inputXML"

    importer = CorpusImporter(reader=SapientaJATSXMLReader())
    importer.collection_id = "PMC_CSC"
    importer.import_id = "initial"
    importer.generate_corpus_id = getPMC_CSC_corpus_id

    # Elasticsearch backend is used here; cp.useLocalCorpus() is the
    # alternative that was left disabled.
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(corpus_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)

    # Options that exist but are deliberately left at their defaults:
    #   "reload_xml_if_doc_in_collection": True   (default: False)
    #   "list_missing_references": True           (default: False)
    #   "update_doc_references": False            (default: True)
    import_options = {}
    import_options["convert_and_import_docs"] = False  # default: True
    import_options["force_generate_resolvable_citations"] = True  # default: False
    # Sets the collection_id again after updating references. Used for fixes.
    import_options["force_collection_id"] = "PMC_CSC"
    import_options["force_import_id"] = "fixed"

    # Left disabled:
    #   corpus_import.FILES_TO_PROCESS_FROM=4500
    #   corpus_import.FILES_TO_PROCESS_TO=10
    #   importer.restartCollectionImport(options)

    importer.use_celery = True
    importer.importCorpus(input_dir, file_mask="*.xml", import_options=import_options)
Пример #2
0
def connectToElastic():
    """
        Select the Elasticsearch corpus backend and connect it to the
        configured endpoint, using the pmc_coresc directory as corpus path.
    """
    cp.useElasticCorpus()
    from minerva.squad.config import MINERVA_ELASTICSEARCH_ENDPOINT
    corpus_dir = r"g:\nlp\phd\pmc_coresc"
    cp.Corpus.connectCorpus(corpus_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
Пример #3
0
def import_aac_corpus():
    """
        Run the import of the AAC corpus.

        Builds a CorpusImporter around a PaperXMLReader, connects to the
        Elasticsearch corpus, installs an AANReferenceMatcher on the corpus
        and then starts a celery-enabled import of the *-paper.xml files in
        the corpus' inputXML directory.
    """
    from minerva.squad.celery_app import MINERVA_ELASTICSEARCH_ENDPOINT

    importer = CorpusImporter(reader=PaperXMLReader())
    importer.collection_id = "AAC"
    importer.import_id = "initial"
    importer.generate_corpus_id = getACL_corpus_id

    # Elasticsearch backend is used here; cp.useLocalCorpus() is the
    # alternative that was left disabled.
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\aac", endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)

    # "list_missing_references": True is also available (default: False).
    import_options = {}
    import_options["convert_and_import_docs"] = False  # default: True

    # Left disabled:
    #   corpus_import.FILES_TO_PROCESS_FROM=10222
    #   corpus_import.FILES_TO_PROCESS_TO=500
    #   importer.restartCollectionImport(options)
    #   cp.Corpus.createAndInitializeDatabase()
    reference_matcher = AANReferenceMatcher("g:\\nlp\\phd\\aan\\release\\acl_full.txt")
    cp.Corpus.matcher = reference_matcher

    importer.use_celery = True
    importer.importCorpus(
        "g:\\nlp\\phd\\aac\\inputXML",
        file_mask="*-paper.xml",
        import_options=import_options,
    )
Пример #4
0
def connectToCorpus():
    """
        Connect to the AAC corpus through the configured Elasticsearch
        endpoint and apply the "AAC" corpus filter.
    """
    from minerva.squad.config import MINERVA_ELASTICSEARCH_ENDPOINT

    # Elasticsearch backend; cp.useLocalCorpus() is the disabled alternative.
    cp.useElasticCorpus()
    corpus_dir = "g:\\nlp\\phd\\aac"
    cp.Corpus.connectCorpus(corpus_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("AAC")
Пример #5
0
def main():
    """
        Connect to the pmc_coresc corpus, apply the "PMC_CSC" corpus filter
        and run the experiment described by the `experiment` and `options`
        names defined outside this function.
    """
    from minerva.squad.config import MINERVA_ELASTICSEARCH_ENDPOINT

    cp.useElasticCorpus()
    corpus_dir = "g:\\nlp\\phd\\pmc_coresc"
    cp.Corpus.connectCorpus(corpus_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("PMC_CSC")

    # NOTE(review): the meaning of the third positional argument (True) is
    # defined by Experiment and is not visible here.
    Experiment(experiment, options, True).run()
def main():
    """
        Connect to the AAC corpus and apply the "AAC" corpus filter.

        Nothing else happens yet: the original left an unfinished
        `train_set=` placeholder after the connection step.
    """
    from minerva.squad.celery_app import MINERVA_ELASTICSEARCH_ENDPOINT

    cp.useElasticCorpus()
    corpus_dir = "g:\\nlp\\phd\\aac"
    cp.Corpus.connectCorpus(corpus_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("AAC")

    # train_set=  (placeholder, not implemented)
Пример #7
0
def main():
    """
        Connect to the AAC corpus, apply the "AAC" corpus filter and run the
        experiment described by the `experiment` and `options` names defined
        outside this function.
    """
    from minerva.squad.celery_app import MINERVA_ELASTICSEARCH_ENDPOINT

    cp.useElasticCorpus()
    corpus_dir = "g:\\nlp\\phd\\aac"
    cp.Corpus.connectCorpus(corpus_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("AAC")

    # Single-file run, left disabled:
    #   experiment["test_files"]=["456f8c80-9807-46a9-8455-cd4a7e346f9d"]

    # NOTE(review): the meaning of the third positional argument (False) is
    # defined by Experiment and is not visible here.
    Experiment(experiment, options, False).run()
Пример #8
0
def main():
    """
        Compute the annotation statistics for one hard-coded document and
        print the statistics the corpus then reports for it as indented JSON.
    """
    import json
    cp.useElasticCorpus()
    from minerva.squad.config import MINERVA_ELASTICSEARCH_ENDPOINT
    cp.Corpus.connectCorpus("", endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)

    # The one document this script works on.
    guid = "957e1fcf-d5b4-41dc-af32-7db08f1d2ded"

    # Alternative left disabled: load the SciDoc and print
    # getAnnotationStatistics(doc) directly.
    computeAnnotationStatistics(guid)

    # Parenthesised single-argument print: produces the same output under the
    # Python 2 print statement and is also a valid Python 3 print() call.
    print(json.dumps(cp.Corpus.getStatistics(guid), indent=3))
Пример #9
0
def main():
    """
        Connect to the pmc_coresc corpus and call aggregate_statistics().
    """
    from minerva.squad.config import MINERVA_ELASTICSEARCH_ENDPOINT

    cp.useElasticCorpus()
    corpus_dir = "g:\\nlp\\phd\\pmc_coresc"
    cp.Corpus.connectCorpus(corpus_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)

    # Steps left disabled:
    #   cp.Corpus.setCorpusFilter(collection_id="PMC_CSC")
    #   add_statistics_to_all_files(use_celery=True)
    #   add_statistics_to_all_files(use_celery=False, max_files=10)
    #   fix_collection_id()
    aggregate_statistics()
Пример #10
0
def fix_broken_scidocs():
    """
        Ask the importer to reload the SciDocs of the "PMC_CSC" collection.

        Per the original description, this walks the papers already in the
        collection, tries to load each SciDoc and re-reads the XML when a
        KeyError occurs; that logic lives in reloadSciDocsOnly() and is not
        visible here.
    """
    cp.useElasticCorpus()
    import minerva.squad.celery_app as celery_app

    es_endpoint = {
        "host": celery_app.MINERVA_ELASTICSEARCH_SERVER_IP,
        "port": celery_app.MINERVA_ELASTICSEARCH_SERVER_PORT,
    }
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\pmc_coresc", endpoint=es_endpoint)

    importer = CorpusImporter("PMC_CSC", "initial", use_celery=True)
    importer.generate_corpus_id = getPMC_CSC_corpus_id

    collection_query = "metadata.collection_id:\"PMC_CSC\""
    xml_dir = "g:\\nlp\\phd\\pmc_coresc\\inputXML"
    importer.reloadSciDocsOnly(collection_query, xml_dir, "*.xml")
Пример #11
0
def basicTest():
    """
        Connect to the AAC corpus and call generateSideBySide() on one
        hard-coded anthology paper.
    """
    import minerva.db.corpora as cp

    drive_letter = "g"
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(drive_letter + ":\\nlp\\phd\\aac")

    # Alternatives left disabled:
    #   doc_list=cp.Corpus.selectRandomInputFiles(3,"*-paper.xml")
    #   doc_list=[r"anthology\W\W11-2166-paper.xml"]
    sample_docs = [r"anthology\W\W10-0402-paper.xml"]

    generateSideBySide(sample_docs)
Пример #12
0
def fix_citation_parent_aac():
    """
        Rename the "parent" key of every citation to "parent_s" in all
        SciDocs of the "AAC" collection.

        Each document is loaded, its citations are patched in place, and the
        document is saved back (whether or not anything was renamed). A
        progress report is shown after every document.
    """
    from minerva.proc.results_logging import ProgressIndicator

    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\aac")

    paper_ids = cp.Corpus.listPapers("metadata.collection_id:\"AAC\"")
    tracker = ProgressIndicator(True, len(paper_ids), True)

    for paper_id in paper_ids:
        scidoc = cp.Corpus.loadSciDoc(paper_id)
        for citation in scidoc.citations:
            if "parent" not in citation:
                continue
            citation["parent_s"] = citation.pop("parent")
        cp.Corpus.saveSciDoc(scidoc)
        tracker.showProgressReport("Fixing badly imported PaperXML")
Пример #13
0
def main():
    """
        Run the experiment on the AAC papers matched by the query
        "year:>=2011".

        The matching papers are stored under experiment["test_files"] before
        the Experiment is built from the `experiment` and `options` names
        defined outside this function.
    """
    drive_letter = "g"

    # Elasticsearch backend; cp.useLocalCorpus() is the disabled alternative.
    cp.useElasticCorpus()

    # NOTE(review): the filter is set before connectCorpus() here, whereas
    # other entry points connect first -- confirm the order does not matter.
    cp.Corpus.setCorpusFilter(collection_id="AAC")
    cp.Corpus.connectCorpus(drive_letter + ":\\nlp\\phd\\aac")

    # Left disabled:
    #   generator=AtharQueryGenerator(drive+r":\NLP\PhD\citation_context\doc_dict.json", reassign_guids=True)
    #   experiment["test_files"]=cp.Corpus.listPapers("year >= 2011")
    recent_papers = cp.Corpus.listPapers("year:>=2011")
    experiment["test_files"] = recent_papers

    Experiment(experiment, options).run()
Пример #14
0
def main():
    """
        Run the experiment on the documents known to an AtharQueryGenerator.

        The generator is built from doc_dict.json, the keys of its `docs`
        attribute become experiment["test_files"], and the generator is
        attached to the Experiment as its query_generator before running.
    """
    drive_letter = "g"

    # Elasticsearch backend; cp.useLocalCorpus() is the disabled alternative.
    cp.useElasticCorpus()

    # NOTE(review): the filter is set before connectCorpus() here, whereas
    # other entry points connect first -- confirm the order does not matter.
    cp.Corpus.setCorpusFilter(collection_id="AAC")
    cp.Corpus.connectCorpus(drive_letter + ":\\nlp\\phd\\aac")

    doc_dict_path = drive_letter + r":\NLP\PhD\citation_context\doc_dict.json"
    query_generator = AtharQueryGenerator(doc_dict_path, reassign_guids=True)
    experiment["test_files"] = query_generator.docs.keys()

    runner = Experiment(experiment, options)
    runner.query_generator = query_generator
    runner.run()
Пример #15
0
def fix_authors_full_corpus():
    """
        Remove the "papers" key that author entries in each paper's metadata
        should not have.

        Every paper returned by cp.Corpus.listPapers() is processed: its
        metadata is fetched, the "papers" key is dropped from each author that
        carries one, and the metadata is written back with updatePaper(). A
        progress report is shown after every paper.

        Authors without a "papers" key are left untouched, so the function
        can be re-run (e.g. after an interrupted run) without raising
        KeyError on entries that were already fixed.
    """
    from minerva.proc.results_logging import ProgressIndicator
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\pmc_coresc")
    guids = cp.Corpus.listPapers()
    progress = ProgressIndicator(True, len(guids), True)
    for guid in guids:
        doc_meta = cp.Corpus.getMetadataByGUID(guid)
        for author in doc_meta.authors:
            # Guarded delete: an unconditional `del` raises KeyError for any
            # author that never had (or no longer has) the key.
            if "papers" in author:
                del author["papers"]
        cp.Corpus.updatePaper(doc_meta)
        progress.showProgressReport("Removing redundant author information")
Пример #16
0
def import_pmc_corpus():
    """
        Run the import of the PMC corpus.

        Imports the *.nxml files from the corpus' inputXML directory through
        a JATSXMLReader-based CorpusImporter, then updates in-collection
        references for every paper the corpus lists.
    """

    def getPMC_corpus_id(filename):
        """
            Return the PMC id for a file (currently always the empty string).
        """
        # Previous implementation, left disabled:
        #   return os.path.split(filename)[1].replace("-paper.xml","").lower()
        return ""

    # NOTE(review): getPMC_corpus_id is never assigned to
    # importer.generate_corpus_id; the only such assignment
    # (importer.generate_corpus_id=getACL_corpus_id) is disabled.
    importer = CorpusImporter(reader=JATSXMLReader())
    importer.collection_id = "PMC"
    importer.import_id = "initial"

    # Elasticsearch backend; cp.useLocalCorpus() is the disabled alternative.
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\pmc")

    input_dir = "g:\\nlp\\phd\\pmc\\inputXML"
    importer.importCorpus(input_dir, file_mask="*.nxml")

    all_papers = cp.Corpus.listPapers()
    importer.updateInCollectionReferences(all_papers, {})
Пример #17
0
def connectToCorpus():
    """
        Select the Elasticsearch corpus backend and connect it to the
        configured endpoint, using the pmc_coresc directory as corpus path.
    """
    cp.useElasticCorpus()
    from minerva.squad.config import MINERVA_ELASTICSEARCH_ENDPOINT
    corpus_dir = r"g:\nlp\phd\pmc_coresc"
    cp.Corpus.connectCorpus(corpus_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
Пример #18
0
def main():
    """
        Run the experiment against the pmc_coresc corpus on a fixed
        Elasticsearch host.

        experiment["test_files"] is set to an empty list (every candidate
        GUID below is commented out) before the Experiment is built from the
        `experiment` and `options` names defined outside this function.
    """
    es_endpoint = {"host": "129.215.90.202", "port": 9200}

    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\pmc_coresc", endpoint=es_endpoint)

    # Single-file run, left disabled:
    #   experiment["test_files"]=["456f8c80-9807-46a9-8455-cd4a7e346f9d"]

    # Candidate test files; uncomment entries to include them in the run.
    experiment["test_files"] = [
        # "bdc9a118-cb76-4d26-9c4d-e886794428f5",
        # "65a4319e-4324-4529-96fa-66c52e392da0",
        # "0cc28fb0-b116-4990-b816-3dc066273c34",
        # "ef3e4284-c527-4e83-9b59-f8996b09df76",
        # "b3129460-d284-4f69-83a8-f87f588e7800",
        # "d8548dab-ff28-4f93-b2ae-16887e59e8ad",
        # "42efd8ec-4c06-4754-a527-3045eed87766",
        # "f4374057-7ab2-4567-b73b-aa5b72328d3e",
        # "cbf989c5-79f5-4317-8515-2192e2a3fe2a",
        # "37d1cc24-68a5-4a36-b55d-94acdfad08c1",
        # "2b5202ec-e71b-4d1a-8ef4-439c4f505342",
        # "11ba9f31-13f8-4a40-8bfc-6c9c7725e7ba",
        # "e047f55f-ff56-44a6-a07c-794887330752",
        # "d39d353a-f9ca-4ce3-ab42-e9a16f5bd372",
        # "a407716f-4516-4cba-9c52-d4e3b09bcda6",
        # "680724b2-50e7-4809-a86f-e63326059f7e",
        # "1ce857ab-7692-4a95-9ba0-f517179a940e",
        # "e12b2e84-a91d-4170-88a6-6ba983ceab1b",
        # "5a6c0a35-dbe0-486a-8edf-3c3d3638f06e",
        # "c40d5876-208c-4eb4-b239-652ed14f8560",
        # "9a764770-fd73-474e-8f38-cf0128371e2c",
        # "54432fc8-c1c4-42f9-95b0-c5fad39f8317",
        # "a7dab0f1-5891-4d83-92c2-d25069c49d27",
        # "283ed90d-3ff9-4161-8c4d-4e55a555973e",
        # "6478c6ca-e16c-473f-9f4c-060143b3cc8f",
        # "666f2c58-3180-465b-877c-28d14cbcdf98",
        # "f5dedb99-f2a1-4ae9-b4a0-3c23e33cbfc9",
        # "e5ed924b-8b78-4c76-bb6c-54d9790c8a15",
        # "b8ace4e7-8523-471f-847b-b45aee8ccfc1",
        # "ff30447d-828e-4699-bbf7-ce586aae9764",
        # "aec8d55c-43e0-42cb-b832-77f888c2325a",
        # "067862a3-d8fd-4252-b831-f6f120af82a1",
        # "64956609-5a4d-4e05-bad1-0445c3d1834d",
        # "cd1cd1ec-ecc9-4e70-96b3-7f1447ec0df3",
        # "d61b922b-622b-440c-b040-3db563fd6f0e",
        # "51d71d97-5abb-4a4d-ba77-7d18a11343f0",
        # "b4c2215a-0a38-4e44-a5ab-f0d0114d89fc",
        # "d3265a02-86ba-47e9-879c-a15043ca5808",
        # "3e53830f-33a5-4192-9159-bcd01a3e66d3",
        # "be50acb5-e165-4afb-b259-eeb9f28d0f2e",
        # "fb8e6675-46d9-41c8-8ba0-598842a63fe8",
        # "34043f1e-3424-4c4a-b782-9489fc274db5",
        # "e07f7715-d400-4958-a0ac-2e6dab3b1843",
        # "44f52aea-cae3-4e1c-85ce-da0038cbcea1",
        # "b99f5b2d-6edd-4787-a50d-fef7d030ff05",
        # "55ef06b7-ffc6-4e43-9362-daf4b9f6735f",
        # "89c63f73-988a-4ece-99fd-e6c91fc9f6fd",
        # "83293e90-8f3e-45db-8dae-49a179568d3e",
        # "7fbcd237-d40d-4d44-9c9f-7e2f462e547e",
        # "5303a3a3-c1cd-458c-9bef-56df3080169d",
        #
        # "a3af153f-cf9d-40ea-b64d-c6e52e0a187b",
        # "38a67959-7ad9-426d-8357-51ab376b7a4b",
        # "5f71b848-2f22-49bf-b32a-c1cc441a6dbe",
        # "e875b2dc-3757-4728-8e8b-47c6a1d8241c",
        # "a1bd31a8-66bc-4be9-aa85-e2e821aa18f5",
        # "ef030f01-cdcb-4aaf-8ec1-c1a8778095da",
        # "5b4f7822-1127-4d45-84ca-6755e1debaab",
        # "a11c9c92-e294-4dfc-8e73-f432ad460776",
        # "6c647939-ef22-4d8b-b887-121272168829",
        # "d4c97daa-790c-40fe-a17e-fffa8e7fbd36",
        # "65ee7543-549d-4821-b51d-8fc27dbe85cb",
        # "4fe8ae7f-47bf-41fd-95ef-14ee7831f37e",
        # "45d1bde0-2bd5-413b-89e3-9151d5a73ffb",
        # "7bedaa57-30ff-4569-8456-59236171a80f",
        # "67a054e0-744e-477b-80a4-06a268064bc7",
        # "51a51cbb-952b-450b-970e-f6a23ecf9ce6",
        # "b14209b3-d868-41cd-b1d0-f1a1489220f3",
        # "53230a94-3baf-4825-a039-e8125890e737",
        # "bb576feb-658e-45ef-810a-617b586159e5",
        # "7d5ad1b5-2f3d-4728-b583-ec1ebbc3dac6",
        # "f27cf9db-0d2d-490b-9917-c076a5ebca2c",
        # "3b6679e5-deae-43ee-a98b-cac7029e92f4",
        # "0d44cbba-1989-4654-b250-1b41285359ea",
        # "b0cceb78-5f66-4084-accb-171040521cda",
        # "18bf7b21-2456-49da-882a-06032ec46bec",
        # "588b99bd-c358-440b-b30c-e1f3dc10b96b",
        # "c4c1d5c0-7f40-465a-bbdb-351b4c9948a8",
        # "0efcb373-ecd3-4e10-9f2a-1bbd3a6cbf58",
        # "9bb0db11-2821-4d55-8f34-9bfd5d58f444",
        # "49ff3f83-b4d7-4979-800f-785460c95552",
        # "58e0a5d1-6343-4e2e-b544-6f690bff023e",
        # "5a84843d-d7b0-43b0-846c-d30d3196ee8a",
        # "6f244f35-8f61-4eb9-9de0-dfbbca63532b",
        # "9d3cf2ea-162b-4e78-a311-b7333ad65c3a",
        # "75e80547-50f5-4a12-8db6-e799a2e5029b",
        # "f6c70cbd-e6c3-4ea5-b99f-ac0c455d832a",
        # "f72d2af0-1e8a-40f5-9acf-ddbe9ddc4a7b",
        # "9a81337e-1280-4b11-9b00-7e516d298ea1",
        # "c7a83006-6ed3-46ec-b476-c49770dc4979",
        # "aa45f968-61a4-421d-a532-a036fb8336ef",
        # "f38092bc-b1e2-4ba8-ad60-b964825e52ac",
        # "a755f020-c04d-4640-8b43-fb63b560bd6e",
        # "1f06edd3-09d6-4033-b65e-a96d6a78f748",
        # "33273c27-bcb7-4a4d-b339-c8af16c97b91",
        # "802d2b57-0425-410e-82b6-f0024bc6f0dd",
        # "699e837a-a662-49a0-b4c5-b3ef113eff34",
        # "5b019a09-e21f-4109-a757-2c8396c8f169",
        # "d4a70f39-7c5c-4566-8b5c-72208f3929ea",
        # "6cf4f22d-c77b-4f0a-9e3c-378d7803f62b",
        #
        # "5cc55656-d309-4906-9cbf-7e34e734c352",
        # "4222902b-e5fc-4eef-a7a7-79ec85d8e7c0",
        # "4e408d49-6e51-441a-9c1c-8720d0d7032a",
        # "aba078e1-c385-45b2-9adf-1ab7901b373b",
        # "772eaabf-8996-486f-9bba-355cbf0c15e1",
        # "38048193-6565-45ea-9950-64c7e4c266a3",
        # "4d36eeb9-9121-4510-847e-99b80c77473e",
        # "7b9e39c9-18a3-4112-ba9f-36b70d60f60f",
        # "e31fd474-e2a0-4b3c-9d36-b31003b3bbc6",
        # "69b92870-e050-4277-bd00-08f79aa6d9e6",
        # "a67be750-73dc-427b-ac6a-e46adcaf7430",
        # "1f8a95c4-856a-4f39-9c45-52d309d8c075",
        # "b3606482-e22f-4809-948a-385a4f1e47cb",
        # "7a4a67fb-3f4c-4c26-a060-775e8a4b7480",
        # "bc39625f-1bc1-49ce-8ea3-4f8debe90b01",
        # "202ef49f-c3f9-4d3a-b971-2d8094c06242",
        # "28967c8d-2584-4898-9b62-0bfc669e2490",
        # "355d6857-06ee-4430-a511-aaf0e8eaf23d",
        # "0ce7eebd-0815-4f5d-b1c9-fafb65584994",
        # "870c4608-525d-44d1-960f-4eb73589618c",
        # "2420a665-d848-459f-9f51-456275d42e8b",
        # "deb5362b-af92-4973-970f-ebe3fec12ee9",
        # "d244e4d9-808c-4abd-a627-02716e9609c9",
        # "de3f08f8-12fa-41d1-82cc-30c2c43cf52e",
        # "575e9d63-94d7-483d-a980-8c974afc0ad9",
        # "31410622-7133-4472-8225-8cf6b1eb1683",
        # "80f3bd59-c5d6-43a0-97e3-155c1af50275",
        # "cf2e8b40-5fab-4b17-acb1-6b676d909aa6",
        # "51375367-8a16-4070-bab7-5ebcca3427c4",
        # "dc293831-b099-45e2-a9ef-87ec8ccb8722",
        # "aad3943f-37f9-4774-aa77-f312650b699e",
        # "cdf828fc-2fb5-4c6f-8b42-8ff7c8ff0ff0",
        # "6569f681-77e3-4ceb-a956-04e7a751f2b3",
        # "2edffd94-b1db-46b7-bc4b-e4da9dcf4f51",
        # "45a922bc-814a-40bd-a76a-fcbeca77bc81",
        # "be4f19f7-de07-4674-8f03-1fbec9c7dd04",
        # "48d9f4cf-c081-4520-b350-6ca3142987a7",
        # "f35243d1-a3e3-4402-99b3-e576a27cde0d",
        # "e8f567f8-3179-4214-bcbc-79332c1cfd1d",
        # "209e32f7-a3cd-4e86-afee-2935a1f25514",
        # "1cd47a2c-58c1-4c89-a689-cbdc0dd1f6b7",
        # "2c64d4d5-3883-4fee-8c2c-1c0afb3835cf",
        # "1323e0b5-c986-4ca6-855a-0b147d938e50",
        # "d293f62a-983f-4ddc-a227-84d82bb36af1",
        # "5b6439c9-466d-4bc0-aff2-e85de8eb9337",
        # "da2b0b43-26b1-458b-b57a-83279ceb314e",
        # "c21e2afc-0f92-490d-aa1b-7f826a83221d",
        # "c5e67372-cf98-45db-bb7a-e3f4e7662774",
        # "c7f91884-cfc8-406b-919e-658008c21279",
        # "753b9d9a-ce8d-4fba-ac74-106526416738",
        # "799680bf-5150-4fb2-b9b6-91fd0edc2593",
    ]

    # NOTE(review): the meaning of the third positional argument (True) is
    # defined by Experiment and is not visible here.
    Experiment(experiment, options, True).run()
def main():
    """
        Connect to the Elasticsearch corpus on ES_SERVER (port 9200) and
        call annotate_one() on a single hard-coded document GUID.
    """
    cp.useElasticCorpus()
    es_endpoint = {"host": ES_SERVER, "port": 9200}
    cp.Corpus.connectCorpus("", endpoint=es_endpoint)

    target_guid = "c3eaadb3-0d3c-4d76-b485-83e8cb2af70f"
    annotate_one(target_guid)