Example #1
from os import path

import requests
import cotools


def download_data():
    '''Download the CORD-19 data if needed.

    This function uses the cord-19-tools (cotools) library to
    download the files. Running it in an empty directory
    downloads the latest version of the data.

    If the data already appears to be downloaded, the
    corresponding step is skipped.

    returns: nothing
    '''
    if not path.exists("metadata.csv"):
        print("Downloading the metadata file")
        r = requests.get(
            "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv"
        )
        with open("metadata.csv", "wb") as f:
            f.write(r.content)
    else:
        print("Metadata Already Downloaded")

    if not path.exists("comm_use_subset"):
        print("Downloading the CORD-19 Dataset")
        try:
            cotools.download()
        except Exception:
            print(
                "cotools had some problems downloading some of the data. Continuing with downloaded data"
            )
    else:
        print("Dataset already download")
Example #2
def download():

    log.info("Start downloading CORD-19 Dataset...")
    os.makedirs(config.DATA_BASE_DIR, exist_ok=True)  # don't fail if the directory already exists
    cotools.download(dir=config.DATA_BASE_DIR)
    log.info("Finished downloading CORD-19 Dataset...")
Example #3
import cotools as co
from pprint import pprint
import os
import sys

downloaded = True  # change me if you haven't downloaded the data

if not downloaded:
    co.download(dir='data', match="2020-04-10", regex=True)

pprint(os.listdir('data'))

data = co.Paperset('data/custom_license')
print(str(sys.getsizeof(data)) + ' bytes')

print(f"{len(data)} papers")

print()
print("How data[index] looks like:")
pprint(data[13])

print()
print("How text looks like")
pprint(co.text(data[13]))

print()
print("How abstract looks like")
try:
    pprint(co.abstract(data[13]))
except KeyError:
    print("Abstract Not Found")
Example #4
from pprint import pprint
import os

import cotools

help(cotools.download)

from datetime import datetime
from datetime import timedelta


cotools.download(dir="data")

# import pdb; pdb.set_trace()  # XXX BREAKPOINT -- debugging leftover, disabled so the script runs through

# noncomm = cotools.Paperset("data/noncomm_use_subset")

data = cotools.Paperset("data/noncomm_use_subset")

cotools.text(data[-1])


# pprint(data[0])
# print(type(data[0]))

# get the text for one feature
cotools.text(data[0])

cotools.texts(data[:15])

Example #5
                text = {'text': wa.request_ncbo_plus(value)}
                d_json["back_matter"].append(text)
    pbar.update()
    Output().save_json(d_json, path_output + '/ncbo/' + folder + '/' + d["paper_id"] + '.json')
    return d_json


if __name__ == '__main__':
    # Path to the CORD-19 dataset
    project_resources = Config.project_resources
    # Path where the annotated files will be saved
    path_output = Config.corpus_annotated
    pathlib.Path(os.path.dirname(project_resources)).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.dirname(path_output)).mkdir(parents=True, exist_ok=True)
    if Config.DOWNLOAD_CORPUS:
        cotools.download(dir=project_resources)
    wa = WrapperAnnotator()
    folders_corpus = ["pdf_json", "pmc_json"]

    for folder in folders_corpus:
        data = cotools.Paperset(project_resources + '/' + folder)

        # You may want to change the number of workers
        if Config.ENTITY_FISHING:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor:
                    executor.map(func_entity_fishing, data)

        if Config.DBPEDIA_SPOTLIGHT:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor: