Example #1
from os import path

import requests
import cotools


def download_data():
    '''Download the data if needed.

    This function uses the cord-19-tools library to download the
    files. Running it in an empty directory downloads the latest
    version of the data.

    If the data already appears to be downloaded, the corresponding
    step is skipped.

    Returns nothing.
    '''
    if not path.exists("metadata.csv"):
        print("Downloading the metadata file")
        r = requests.get(
            "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv"
        )
        with open("metadata.csv", "wb") as f:
            f.write(r.content)
    else:
        print("Metadata already downloaded")

    if not path.exists("comm_use_subset"):
        print("Downloading the CORD-19 Dataset")
        try:
            cotools.download()
        except Exception:
            print(
                "cotools had some problems downloading some of the data. "
                "Continuing with downloaded data"
            )
    else:
        print("Dataset already downloaded")
Example #2
import os

import cotools


def download():

    log.info("Start downloading CORD-19 Dataset...")
    # exist_ok avoids a FileExistsError when the directory already exists
    os.makedirs(config.DATA_BASE_DIR, exist_ok=True)
    cotools.download(dir=config.DATA_BASE_DIR)
    log.info("Finished downloading CORD-19 Dataset...")
Example #3
import cotools as co
from pprint import pprint
import os
import sys

downloaded = True  # change me if you haven't downloaded the data

if not downloaded:
    co.download(dir='data', match="2020-04-10", regex=True)

pprint(os.listdir('data'))

data = co.Paperset('data/custom_license')
# Paperset loads papers lazily, so this reports only the wrapper's size
print(str(sys.getsizeof(data)) + ' bytes')

print(f"{len(data)} papers")

print()
print("How data[index] looks like:")
pprint(data[13])

print()
print("How text looks like")
pprint(co.text(data[13]))

print()
print("How abstract looks like")
try:
    pprint(co.abstract(data[13]))
except KeyError:
    print("Abstract Not Found")
Example #4
from pprint import pprint
import os

import cotools

help(cotools.download)

cotools.download(dir="data")

data = cotools.Paperset("data/noncomm_use_subset")

cotools.text(data[-1])


# pprint(data[0])
# print(type(data[0]))

# get the text for one paper
cotools.text(data[0])

# get the text for a slice of papers
cotools.texts(data[:15])
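A short follow-up, assuming cotools.texts returns one string per paper in the slice (as its use above suggests):

# Rough size, in characters, of each of the first 15 full texts
for i, t in enumerate(cotools.texts(data[:15])):
    print(i, len(t))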

Example #5
                text = {'text': wa.request_ncbo_plus(value)}
                d_json["back_matter"].append(text)
    pbar.update()
    Output().save_json(d_json, path_output + '/ncbo/' + folder + '/' + d["paper_id"] + '.json')
    return d_json


if __name__ == '__main__':
    # Path to the CORD-19 dataset
    project_resources = Config.project_resources
    # Path where the annotated files will be saved
    path_output = Config.corpus_annotated
    pathlib.Path(os.path.dirname(project_resources)).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.dirname(path_output)).mkdir(parents=True, exist_ok=True)
    if Config.DOWNLOAD_CORPUS:
        cotools.download(dir=project_resources)
    wa = WrapperAnnotator()
    folders_corpus = ["pdf_json", "pmc_json"]

    for folder in folders_corpus:
        data = cotools.Paperset(project_resources + '/' + folder)

        # You may want to change the number of workers
        if Config.ENTITY_FISHING:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor:
                    executor.map(func_entity_fishing, data)

        if Config.DBPEDIA_SPOTLIGHT:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor: