Пример #1
0
def ez_connect(corpus="AAC", es_config=None):
    """
    Connect to the Elasticsearch corpus in a single call.

    :param corpus: collection to connect to: "AAC", "PMC_CSC", or None to
        connect with an empty root directory and no corpus filter
    :param es_config: optional value passed to ``celery_app.set_config``;
        when truthy, the result replaces
        ``celery_app.MINERVA_ELASTICSEARCH_ENDPOINT`` before connecting
    :return: ``cp.Corpus`` after ``connectCorpus`` has been called on it
    :raises ValueError: if ``corpus`` is none of the supported values
    """
    # Resolve the root directory first so that an unsupported corpus is
    # rejected before anything is connected.
    if corpus is None:
        corpus_root = ""
    elif corpus == "AAC":
        corpus_root = getRootDir("aac")
    elif corpus == "PMC_CSC":
        corpus_root = getRootDir("pmc_coresc")
    else:
        raise ValueError("Unknown corpus")

    cp.useElasticCorpus()

    if es_config:
        # Override the module-level endpoint so the connect call below
        # (and any later reader of that attribute) sees the new value.
        new_endpoint = celery_app.set_config(es_config)
        celery_app.MINERVA_ELASTICSEARCH_ENDPOINT = new_endpoint

    cp.Corpus.connectCorpus(
        corpus_root,
        endpoint=celery_app.MINERVA_ELASTICSEARCH_ENDPOINT,
    )

    if corpus:
        cp.Corpus.setCorpusFilter(corpus)
    return cp.Corpus
Пример #2
0
def import_aac_corpus(endpoint, use_celery=True):
    """
    Import the AAC corpus from its "*-paper.xml" input files.

    :param endpoint: not used inside this function; kept in the signature
        so existing callers that pass it keep working
    :param use_celery: assigned to the importer's ``use_celery`` attribute
    """
    aac_importer = CorpusImporter(reader=PaperXMLReader())
    aac_importer.collection_id = "AAC"
    aac_importer.import_id = "initial"
    aac_importer.generate_corpus_id = getACL_corpus_id

    # No options are overridden. Keys that have been toggled here before:
    #   "list_missing_references"  (default: False)
    #   "convert_and_import_docs"  (default: True)
    import_options = {}

    # The reference matcher is built from the "acl_full.txt" file found in
    # the "release" folder under the "aan" root directory.
    aan_release_file = os.path.join(
        getRootDir("aan"), "release", "acl_full.txt")
    cp.Corpus.matcher = AANReferenceMatcher(aan_release_file)

    aac_importer.use_celery = use_celery

    xml_input_dir = os.path.join(getRootDir("aac"), "inputXML")
    aac_importer.importCorpus(
        xml_input_dir,
        file_mask="*-paper.xml",
        import_options=import_options,
    )
Пример #3
0
def main():
    """
    Connect to the AAC corpus and run the experiment.

    Builds an ``Experiment`` from the module-level ``experiment`` and
    ``options`` objects and calls its ``run`` method.
    """
    from multi.config import MINERVA_ELASTICSEARCH_ENDPOINT

    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(
        getRootDir("aac"),
        endpoint=MINERVA_ELASTICSEARCH_ENDPOINT,
    )
    cp.Corpus.setCorpusFilter("AAC")

    # A previous debugging run set experiment["test_files"] to a
    # single-guid list here; that override is disabled.

    Experiment(experiment, options, False).run()
Пример #4
0
def main():
    """
    Run ``fix_stranded_citations_in_docs`` over every AAC paper.

    Afterwards prints the module-level counters ``num_removed_sent`` and
    ``num_papers_removed_sent`` (presumably updated by the fixer; confirm
    against its implementation).
    """
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(
        getRootDir("aac"),
        endpoint=MINERVA_ELASTICSEARCH_ENDPOINT,
    )
    cp.Corpus.setCorpusFilter("AAC")

    # fix_sentence_splitting_in_docs is the alternative pass; it is
    # disabled here.
    all_papers = cp.Corpus.listPapers()
    fix_stranded_citations_in_docs(all_papers)

    # The counters are only read, so no ``global`` declaration is needed.
    summary = "Removed {} sentences from {} papers".format(
        num_removed_sent, num_papers_removed_sent)
    print(summary)
Пример #5
0
def find_new_citations_in_aac():
    """
    Re-scan every AAC scidoc for plain-text citations that were missed.

    Every sentence of every listed paper is passed to
    ``annotatePlainTextCitationsInSentence``. Each document is saved back
    to the corpus after its sentences have been processed, and summary
    counts are printed once all papers are done.

    :return: None
    """
    from multi.celery_app import MINERVA_ELASTICSEARCH_ENDPOINT
    from tqdm import tqdm

    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(getRootDir("aac"),
                            endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("AAC")

    total_found = 0
    total_could_match = 0
    docs_with_new_ones = 0
    existing_citations = 0

    counter = tqdm(cp.Corpus.listPapers())

    for guid in counter:
        # Show the running totals (as of the previous document) in the
        # progress bar.
        counter.set_description(
            "{} docs_with_new_ones, {} total_found, {} total_could_match, {} existing_citations, "
            .format(docs_with_new_ones, total_found, total_could_match,
                    existing_citations))
        doc = cp.Corpus.loadSciDoc(guid)

        doc_has_new_citations = False
        for sent in doc.allsentences:
            # Count what is already annotated before the annotator runs on
            # this sentence.
            existing_citations += len(sent.get("citations", []))
            new_citations, citations_found = annotatePlainTextCitationsInSentence(
                sent, doc)
            num_found = len(citations_found)
            if num_found > 0:
                total_found += num_found
                total_could_match += len(new_citations)
                doc_has_new_citations = True

        # Count the document once, however many of its sentences yielded
        # citations; counting per sentence inflated this figure.
        if doc_has_new_citations:
            docs_with_new_ones += 1

        cp.Corpus.saveSciDoc(doc)

    print("Total citations found: ", total_found)
    print("Total citations could match: ", total_could_match)
    print("Docs with new citations: ", docs_with_new_ones)
    print("Previously annotated citations: ", existing_citations)
Пример #6
0
def fix_citation_parent_aac():
    """
    Rename the "parent" key of citations to "parent_s" in all AAC documents.

    Every paper whose ``metadata.collection_id`` matches "AAC" is loaded,
    its citations are updated in place, and the document is saved back.
    A progress report is shown after each document.
    """
    from proc.results_logging import ProgressIndicator

    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(getRootDir("aac"))

    aac_query = {"match": {"metadata.collection_id": "AAC"}}
    paper_guids = cp.Corpus.listPapers(aac_query)
    progress = ProgressIndicator(True, len(paper_guids), True)

    for paper_guid in paper_guids:
        scidoc = cp.Corpus.loadSciDoc(paper_guid)
        for citation in scidoc.citations:
            if "parent" not in citation:
                continue
            # Move the value over to the new key, dropping the old one.
            citation["parent_s"] = citation.pop("parent")
        cp.Corpus.saveSciDoc(scidoc)
        progress.showProgressReport("Fixing badly imported PaperXML")
Пример #7
0
def main():
    """
    Connect to the AAC corpus on the "aws-server" endpoint and call
    ``createAndInitializeDatabase`` on it.
    """
    from multi.celery_app import set_config

    aws_endpoint = set_config("aws-server")

    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(
        getRootDir("aac"),
        endpoint=aws_endpoint,
    )
    cp.Corpus.createAndInitializeDatabase()

    # Follow-up steps that have been run from here before; both are
    # currently disabled:
    #   import_aac_corpus(aws_endpoint, use_celery=False)
    #   fix_citation_parent_aac()
Пример #8
0
from db.elastic_corpus import ElasticCorpus
import db.corpora as cp
from proc.general_utils import getRootDir
from multi.config import set_config

# Script body: connect to the AAC corpus on the "aws-server" endpoint and
# create/initialise its database.

# Root directory for the "aac" corpus, as returned by getRootDir.
root_dir = getRootDir("aac")

# Presumably selects the Elasticsearch corpus implementation for cp.Corpus;
# confirm in db.corpora.
cp.useElasticCorpus()
# Alternative endpoint, currently disabled:
# cp.Corpus.connectCorpus(root_dir, endpoint=GCP_ENDPOINT)
# The endpoint is whatever set_config returns for "aws-server".
cp.Corpus.connectCorpus(root_dir, endpoint=set_config("aws-server"))

print("")

# Disabled: delete existing indices before re-creating them. The first
# variant also drops "scidocs"; the second keeps it.
# for index in ["scidocs", "papers", "venues", "cache", "authors", "links", "missing_references"]:
# for index in ["papers", "venues", "cache", "authors", "links", "missing_references"]:
#     if cp.Corpus.es.indices.exists(index):
#         cp.Corpus.deleteIndex(index)

cp.Corpus.createAndInitializeDatabase()

# if cp.Corpus.es.indices.exists("scidocs"):
#     cp.Corpus.deleteIndex("scidocs")

# settings = {
#     "number_of_shards": 5,
#     "number_of_replicas": 1
# }
# properties = {
#     "scidoc": {"type": "string", "index": "no", "store": True, "doc_values": False},
#     "guid": {"type": "string", "index": "not_analyzed", "store": True},