def get_xml_files(PMClist_file, xml_folder):
    """
    Download fulltext XML files for all PMCs in the file PMClist_file
    :param PMClist_file: Path to text file with full list of PMCs for dataset
    :param xml_folder: Path to folder where the fulltext XML files will be saved
    """
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/PMCXXXXX/fullTextXML'
    if not xml_folder.exists():
        xml_folder.mkdir()

    with PMClist_file.open() as fin:
        pmcs = [el.strip() for el in fin.readlines()]
        pmcs = [el for el in pmcs if len(el)]

    for el in pmcs:
        path2file = xml_folder.joinpath(el + '.xml')
        response = requests.get(url=url.replace('PMCXXXXX', el))
        if response.ok:
            with path2file.open('w') as f:
                f.write(response.text)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)

    return
def get_pdf_files(PMClist_file, pdf_folder):
    """
    Download PDF files for all PMCs in the file PMClist_file
    :param PMClist_file: Path to text file with full list of PMCs for dataset
    :param pdf_folder: Path to folder where the pdf files will be saved
    """
    url = 'https://europepmc.org/backend/ptpmcrender.fcgi?accid=PMCXXXXX&blobtype=pdf'
    if not pdf_folder.exists():
        pdf_folder.mkdir()

    with PMClist_file.open() as fin:
        pmcs = [el.strip() for el in fin.readlines()]
        pmcs = [el for el in pmcs if len(el)]

    for el in pmcs:
        path2file = pdf_folder.joinpath(el + '.pdf')
        response = requests.get(url=url.replace('PMCXXXXX', el))
        if response.ok:
            with path2file.open('wb') as f:
                f.write(response.content)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)

    return
def get_annotations(PMClist_file, annotations_folder):
    """
    Download available EuropePMC annotations for all PMCs in the file PMClist_file
    :param PMClist_file: Path to text file with full list of PMCs for dataset
    :param annotations_folder: Path to folder where the annotation files will be saved
    """
    url = 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds?articleIds=PMC%3AXXXXX&format=JSON'
    if not annotations_folder.exists():
        annotations_folder.mkdir()

    with PMClist_file.open() as fin:
        pmcs = [el.strip() for el in fin.readlines()]
        pmcs = [el for el in pmcs if len(el)]

    for el in pmcs:
        path2file = annotations_folder.joinpath(el + '.json')
        response = requests.get(url=url.replace('XXXXX', el.split('PMC')[1]))
        if response.ok:
            with path2file.open('w') as f:
                f.write(response.text)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)

    return
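
# Illustrative usage only: a minimal sketch of how the three EuropePMC download
# helpers above might be driven from a single PMC list. The file and folder
# paths below are hypothetical examples, not part of the original pipeline.
def _example_fetch_europepmc_dataset():
    from pathlib import Path
    pmc_list = Path('data/PMC_list.txt')                  # hypothetical list of PMC ids, one per line
    get_xml_files(pmc_list, Path('data/xml'))             # full-text XML
    get_pdf_files(pmc_list, Path('data/pdf'))             # PDF renditions
    get_annotations(pmc_list, Path('data/annotations'))   # EuropePMC annotations (JSON)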
def get_S2_data(data_folder, csv_file, S2_folder):
    """
    Download Semantic Scholar description of all papers and authors in base dataset
    :param data_folder: Path to folder with the Bio-protocol .json files
    :param csv_file: Path to csv file with PMID to PMC translations
    :param S2_folder: Path to folder where the downloaded papers will be saved
    """
    paperUrl = 'https://api.semanticscholar.org/v1/paper/XXXXX'
    authorUrl = 'https://api.semanticscholar.org/v1/author/XXXXX'
    if not S2_folder.exists():
        S2_folder.mkdir()

    papers = [f for f in data_folder.iterdir() if f.name.endswith('.json')]
    ProtocolID = []
    S2ID = []

    for idx, el in enumerate(papers):
        time.sleep(5)
        doi = '10.21769/BIOPROTOC.' + el.name.split('Bio-protocol')[1].split('.json')[0]
        printgr('Processing paper ' + el.name + ' (' + str(idx) + '). DOI: ' + doi)
        response = requests.get(url=paperUrl.replace('XXXXX', doi))
        if not response.ok:
            print(response.status_code)
            if response.status_code == 404:
                # Retry once with the lowercase DOI before giving up
                time.sleep(5)
                response = requests.get(url=paperUrl.replace('XXXXX', doi.lower()))
        if response.ok:
            paperdata = json.load(StringIO(response.text))
            with S2_folder.joinpath(paperdata['paperId'] + '.json').open('w') as fout:
                json.dump(paperdata, fout)
            for author in paperdata['authors']:
                authorId = author['authorId']
                time.sleep(5)
                response = requests.get(url=authorUrl.replace('XXXXX', str(authorId)))
                if not response.ok:
                    print(response)
                    print(authorId)
                else:
                    authordata = json.load(StringIO(response.text))
                    # Save author data
                    with S2_folder.joinpath('Author' + str(authorId) + '.json').open('w') as fout:
                        json.dump(authordata, fout)
            ProtocolID.append(el.name.split('.json')[0])
            S2ID.append(paperdata['paperId'])

    with csv_file.open('w') as fout:
        fout.write('ProtocolID,S2ID\n')
        for PID, sid in zip(ProtocolID, S2ID):
            fout.write(PID + ',' + sid + '\n')

    return
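
# For reference, a minimal sketch of the filename-to-DOI mapping that the
# function above relies on. The filename 'Bio-protocol1234.json' is a
# hypothetical example; this helper simply mirrors the string parsing used in
# get_S2_data and is not part of the original module.
def _protocol_doi_from_filename(filename):
    """Map a Bio-protocol JSON filename to its DOI, e.g.
    'Bio-protocol1234.json' -> '10.21769/BIOPROTOC.1234' (hypothetical example)."""
    protocol_number = filename.split('Bio-protocol')[1].split('.json')[0]
    return '10.21769/BIOPROTOC.' + protocol_number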
def get_S2_data(csv_file, S2_folder):
    """
    Download Semantic Scholar description of all papers and authors in base dataset
    :param csv_file: Path to csv file with PMID to PMC translations
    :param S2_folder: Path to folder where the downloaded papers will be saved
    """
    paperUrl = 'https://api.semanticscholar.org/v1/paper/XXXXX'
    authorUrl = 'https://api.semanticscholar.org/v1/author/XXXXX'
    if not S2_folder.exists():
        S2_folder.mkdir()

    df = pd.read_csv(csv_file)
    for idx, el in enumerate(df.S2ID.values.tolist()):
        printgr('Processing paper ' + el + ' (' + str(idx) + ')')
        time.sleep(5)
        response = requests.get(url=paperUrl.replace('XXXXX', el))
        while not response.ok:
            print('Sleep')
            time.sleep(10)
            response = requests.get(url=paperUrl.replace('XXXXX', el))

        paperdata = json.load(StringIO(response.text))
        with S2_folder.joinpath(el + '.json').open('w') as fout:
            json.dump(paperdata, fout)

        for author in paperdata['authors']:
            authorId = author['authorId']
            time.sleep(5)
            response = requests.get(url=authorUrl.replace('XXXXX', str(authorId)))
            if not response.ok:
                print(response)
                print(authorId)
            else:
                authordata = json.load(StringIO(response.text))
                # Save author data
                with S2_folder.joinpath('Author' + str(authorId) + '.json').open('w') as fout:
                    json.dump(authordata, fout)

    return
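
# The two get_S2_data variants above throttle and retry Semantic Scholar
# requests with fixed sleeps. Below is a minimal, hypothetical sketch of that
# same pattern factored into a helper with a bounded number of retries; it is
# an illustration of the idea, not part of the original module's API. The
# `requests` and `time` modules are assumed to be imported at the top of this
# file, as in the functions above.
def _polite_get(url, max_retries=5, wait_seconds=10):
    """Issue a GET request, sleeping and retrying on non-OK responses.
    Returns the response on success, or None after max_retries failures."""
    for _ in range(max_retries):
        response = requests.get(url=url)
        if response.ok:
            return response
        time.sleep(wait_seconds)
    return None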
def get_annotationsMED(PMIDs, annotations_folder):
    """
    Download available EuropePMC annotations for all files in the list PMIDs
    :param PMIDs: list of Pubmed identifiers
    :param annotations_folder: Path to folder where the annotation files will be saved
    """
    url = 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds?articleIds=MED%3AXXXXX&format=JSON'
    if not annotations_folder.exists():
        annotations_folder.mkdir()

    for el in PMIDs:
        path2file = annotations_folder.joinpath('PMID' + el + '.json')
        response = requests.get(url=url.replace('XXXXX', el))
        if response.ok:
            with path2file.open('w') as f:
                f.write(response.text)
            printgr('Correctly processed ' + el)
        else:
            printred('Could not retrieve ' + el)

    return
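
# Illustrative usage only: get_annotationsMED expects plain PMID strings
# (without the 'PMC' prefix used by the other helpers). The identifiers and
# folder below are hypothetical examples, not values from the original dataset.
def _example_fetch_med_annotations():
    from pathlib import Path
    example_pmids = ['12345678', '23456789']   # hypothetical PMIDs
    get_annotationsMED(example_pmids, Path('data/annotations_med'))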
============================================================ """ if lemmatization: #Conectamos a la Base de Datos de Semantic Scholar dbCONNECTOR = cf.get('DB', 'dbCONNECTOR') dbNAME = cf.get('DB', 'dbNAME') dbUSER = cf.get('DB', 'dbUSER') dbPASS = cf.get('DB', 'dbPASS') dbSERVER = cf.get('DB', 'dbSERVER') dbSOCKET = cf.get('DB', 'dbSOCKET') # DM = BaseDMsql(db_name=dbNAME, db_connector=dbCONNECTOR, path2db=None, # db_server=dbSERVER, db_user=dbUSER, db_password=dbPASS, # unix_socket=dbSOCKET) printgr('Reading Bio-Protocols from database') base_df = pd.read_csv(csv_file, low_memory=False, dtype=str) Protocol_S2 = {el[0]:el[1] for el in base_df.values.tolist()} extended_df = pd.read_csv(csv_file_extended, low_memory=False, dtype=str) base_S2 = [el[1] for el in base_df.values.tolist()] extended_S2 = [el[0] for el in extended_df.values.tolist() if el[0] not in base_S2] BIO_df = pd.DataFrame(columns=['ProtocolID', 'S2paperID', 'title', 'paperAbstract', 'procedure', 'keywords']) #Empezamos leyendo todos los artículos que no están en el dataset base # for S2id in extended_S2: # dfaux = DM.readDBtable('S2papers', limit=None, selectOptions='S2paperID, title, paperAbstract', # filterOptions='S2paperID="'+S2id+'"') # BIO_df = BIO_df.append(dfaux, ignore_index = True, sort=False) # #Now, we move to the protocols in the base dataset # protocols = [f for f in protocols_folder.iterdir() if f.name.endswith('.json')] # all_prot_data = []
============================================================ """ if lemmatization: #Conectamos a la Base de Datos de Semantic Scholar dbCONNECTOR = cf.get('DB', 'dbCONNECTOR') dbNAME = cf.get('DB', 'dbNAME') dbUSER = cf.get('DB', 'dbUSER') dbPASS = cf.get('DB', 'dbPASS') dbSERVER = cf.get('DB', 'dbSERVER') dbSOCKET = cf.get('DB', 'dbSOCKET') # DM = BaseDMsql(db_name=dbNAME, db_connector=dbCONNECTOR, path2db=None, # db_server=dbSERVER, db_user=dbUSER, db_password=dbPASS, # unix_socket=dbSOCKET) printgr('Reading Agriculture data from database') AGR_df = pd.read_csv(csv_file_extended, low_memory=False, dtype=str) AGR_S2 = AGR_df['S2paperID'].values.tolist() AGR_df = pd.DataFrame() # for S2id in AGR_S2: # dfaux = DM.readDBtable('S2papers', limit=None, selectOptions='S2paperID, title, paperAbstract', # filterOptions='S2paperID="'+S2id+'"') # AGR_df = AGR_df.append(dfaux, ignore_index = True) # print('Agriculture data loaded, #papers:', len(AGR_df)) # from lemmatizer.ENlemmatizer import ENLemmatizer # lemmas_server = cf.get('Lemmatizer', 'server') # stw_file = Path(cf.get('Lemmatizer', 'default_stw_file')) # dict_eq_file = Path(cf.get('Lemmatizer', 'default_dict_eq_file')) # POS = cf.get('Lemmatizer', 'POS') # concurrent_posts = int(cf.get('Lemmatizer', 'concurrent_posts'))