from os import path

import cotools
import requests


def download_data():
    '''
    Download the data if needed.

    This function uses the cord-19-tools library to download the files.
    Running it in an empty directory downloads the latest version of the
    data. If the data already appears to be downloaded, the step is skipped.

    returns: nothing
    '''
    if not path.exists("metadata.csv"):
        print("Downloading the metadata file")
        r = requests.get(
            "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv"
        )
        with open("metadata.csv", "wb") as f:
            f.write(r.content)
    else:
        print("Metadata already downloaded")

    if not path.exists("comm_use_subset"):
        print("Downloading the CORD-19 dataset")
        try:
            cotools.download()
        except Exception:  # a bare except would also swallow KeyboardInterrupt
            print(
                "cotools had some problems downloading some of the data. "
                "Continuing with downloaded data"
            )
    else:
        print("Dataset already downloaded")
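# A minimal usage sketch for the function above. "comm_use_subset" is the
# directory that download_data() itself checks for; Paperset is cotools'
# dataset loader, as used in the snippets below.
download_data()
papers = cotools.Paperset("comm_use_subset")
print(f"{len(papers)} papers available")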
def download():
    log.info("Start downloading CORD-19 Dataset...")
    # exist_ok avoids crashing when the target directory already exists
    os.makedirs(config.DATA_BASE_DIR, exist_ok=True)
    cotools.download(dir=config.DATA_BASE_DIR)
    log.info("Finished downloading CORD-19 Dataset")
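# The function above assumes project-specific `log` and `config` objects.
# A minimal stand-in sketch so it runs on its own (the names and the "data"
# directory here are assumptions, not the project's real configuration):
import logging
import os

import cotools

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


class config:  # stand-in for the project's config module
    DATA_BASE_DIR = "data"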
import os
import sys
from pprint import pprint

import cotools as co

downloaded = True  # change me if you haven't downloaded the data

if not downloaded:
    co.download(dir='data', match="2020-04-10", regex=True)

pprint(os.listdir('data'))

data = co.Paperset('data/custom_license')
print(str(sys.getsizeof(data)) + ' bytes')
print(f"{len(data)} papers")
print()

print("What data[index] looks like:")
pprint(data[13])
print()

print("What the text looks like:")
pprint(co.text(data[13]))
print()

print("What the abstract looks like:")
try:
    pprint(co.abstract(data[13]))
except KeyError:
    print("Abstract not found")
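# The same KeyError guard scales to the whole Paperset. A hedged sketch that
# collects every available abstract, indexing explicitly since only
# __getitem__/__len__ are demonstrated above:
abstracts = []
for i in range(len(data)):
    try:
        abstracts.append(co.abstract(data[i]))
    except KeyError:
        continue  # this paper's JSON has no abstract section
print(f"{len(abstracts)} of {len(data)} papers have an abstract")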
import os
from pprint import pprint

import cotools

help(cotools.download)

cotools.download(dir="data")

data = cotools.Paperset("data/noncomm_use_subset")
cotools.text(data[-1])
# pprint(data[0])
# print(type(data[0]))

# get the text for one paper
cotools.text(data[0])
cotools.texts(data[:15])
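# A follow-on sketch: dump the first 15 full texts to disk. This assumes
# cotools.texts() returns one string per paper; the "texts" directory and
# index-based file names are arbitrary choices here, not part of the library.
os.makedirs("texts", exist_ok=True)
for i, paper_text in enumerate(cotools.texts(data[:15])):
    with open(os.path.join("texts", f"paper_{i}.txt"), "w") as f:
        f.write(paper_text)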
        text = {'text': wa.request_ncbo_plus(value)}
        d_json["back_matter"].append(text)
        pbar.update()
    Output().save_json(d_json, path_output + '/ncbo/' + folder + '/' + d["paper_id"] + '.json')
    return d_json


if __name__ == '__main__':
    # Path to the CORD-19 dataset
    project_resources = Config.project_resources
    # Path where the annotated files will be saved
    path_output = Config.corpus_annotated

    pathlib.Path(os.path.dirname(project_resources)).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.dirname(path_output)).mkdir(parents=True, exist_ok=True)

    if Config.DOWNLOAD_CORPUS:
        cotools.download(dir=project_resources)

    wa = WrapperAnnotator()
    folders_corpus = ["pdf_json", "pmc_json"]
    for folder in folders_corpus:
        data = cotools.Paperset(project_resources + '/' + folder)
        # You may want to change the number of workers
        if Config.ENTITY_FISHING:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor:
                    executor.map(func_entity_fishing, data)
        if Config.DBPEDIA_SPOTLIGHT:
            with tqdm.tqdm(total=len(data)) as pbar:
                with concurrent.futures.ProcessPoolExecutor() as executor:
                    # worker name inferred by symmetry with the entity-fishing block
                    executor.map(func_dbpedia_spotlight, data)
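# Note: pbar.update() calls made inside ProcessPoolExecutor workers run in
# child processes and will not advance a bar owned by the parent process.
# A common alternative sketch (func is any picklable worker, e.g.
# func_entity_fishing) drives tqdm from the main process by consuming
# executor.map lazily:
import concurrent.futures

import tqdm


def annotate_all(func, data):
    # executor.map yields results in input order as they become available,
    # so the bar advances roughly once per completed paper
    with concurrent.futures.ProcessPoolExecutor() as executor:
        return list(tqdm.tqdm(executor.map(func, data), total=len(data)))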