def __init__(self, study_list: List[str], user_token: str, wsc: WsClient, iac: IsaApiClient):
    """Store the collaborators and prepare the defaults used for Europe PMC requests.

    A register of ``Accept`` headers is kept because the same Europe PMC endpoint is queried
    in two response formats (JSON for articles, XML for citation references), alongside a
    base set of query parameters that individual requests cascade from.

    :param study_list: A list of studies to iterate over, each of which is looked up in Europe PMC.
    :param user_token: User token for use with the java webservice; must belong to a curator
        (a non-curator is expected to have been rejected before this point).
    :param wsc: WsClient that interfaces with the java webservice.
    :param iac: IsaApiClient, used to get study information.
    """
    # Remote endpoint and the HTTP session shared by every request made by this instance.
    self.europe_pmc_url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search'
    self.session = requests.Session()

    # Collaborators and inputs handed in by the caller.
    self.study_list, self.user_token = study_list, user_token
    self.wsc, self.iac = wsc, iac

    # One Accept header per kind of lookup performed against the endpoint.
    article_headers = {'Accept': 'application/json'}
    citation_headers = {'Accept': 'application/xml'}
    self.headers_register = {
        'article': article_headers,
        'citation_ref': citation_headers,
    }

    # Defaults that each query overrides via ``cascade``.
    search_defaults = {
        'resultType': 'core',
        'format': 'JSON',
        'cursorMark': '*',
        'pageSize': '5',
        'fromSearchPost': False,
        'query': '',
    }
    self.base_params = CascaDict(search_defaults)
def process(self, study_id) -> List:
    """Build the report rows for a single study from the study list.

    The java webservice is asked first for basic information about the study (location,
    status, submission and release dates). The IsaApi client is then used to load the study so
    that its title and publications are available. Europe PMC is searched by study title, the
    hits are narrowed to those whose title closely resembles the study title, and one row is
    produced per publication recorded on the study.

    :param study_id: current study_id to process.
    :return: List of dict-like rows for the generated report. A single basic row is returned
        when the study cannot be loaded, when the Europe PMC response has no ``resultList``,
        or when the study has no publications.
    """
    row_dicts = []

    # Requests made here expect JSON. The original author notes that resetting the header on
    # every call is unsavoury but avoids writing a second, near-identical method.
    self.session.headers.update(self.headers_register['article'])

    # Only the location, dates and status are needed from the permissions tuple.
    (_is_curator, _read_access, _write_access, _obfuscation_code, study_location,
     release_date, submission_date, study_status) = self.wsc.get_permissions(study_id, self.user_token)

    base_return_dict = CascaDict({
        'Identifier': study_id,
        'Title': 'N/A',
        'Submission Date': submission_date,
        'Status': study_status,
        'Release Date': release_date,
        'PubmedID': 'N/A',
        'DOI': 'N/A',
        'Author List': 'N/A',
        'Publication Date': 'N/A',
        'Citing Reference': 'N/A',
        'Publication in MTBLS': 'N/A',
        'Journal in EuropePMC': 'N/A',
        'Released before curation finished?': 'N/A'
    })

    isa_study, _isa_inv, _std_path = self.iac.get_isa_study(
        study_id, self.user_token, skip_load_tables=True,
        study_location=study_location, failing_gracefully=True)

    # If get_isa_study has failed, isa_study will come back as None, and so we won't have any
    # publication information to work with. So we just return the very basic dict.
    if isa_study is None:
        row_dicts.append(base_return_dict)
        return row_dicts

    title = isa_study.title
    publications = isa_study.publications

    # Here we search on the study title rather than on each specific publication.
    fresh_params = self.base_params.cascade({'query': title, 'format': 'JSON'})
    europepmc_study_search_results = self.session.get(
        self.europe_pmc_url, params=fresh_params).json()

    # If there is an issue with the query then just return the basic details dict.
    if 'resultList' not in europepmc_study_search_results:
        row_dicts.append(base_return_dict.cascade({'Title': title}))
        return row_dicts

    # Keep only hits whose title is a close fuzzy match (ratio above 80) to the study title.
    culled_results = [
        result
        for result in europepmc_study_search_results['resultList']['result']
        if fuzz.ratio(result['title'], title) > 80
    ]

    # NOTE(review): the keys 'PubmedId', 'Citation Reference' and 'Released before curated?'
    # used below differ from the base row's 'PubmedID', 'Citing Reference' and
    # 'Released before curation finished?', so these rows carry both spellings. They are left
    # unchanged because report consumers outside this method may rely on the current names.
    # NOTE(review): when no hit survives the title filter but the study does have
    # publications, no row is produced for the study at all - confirm this is intended.
    if culled_results:
        for pub in publications:
            logger.info(pub)
            result = self.has_mapping(pub, culled_results)
            if result:
                logger.info('hit %s', result)
                temp_dict = base_return_dict.cascade({
                    'Title': title,
                    'PubmedId': result['pmid'],
                    'DOI': pub.doi,
                    'Author List': pub.author_list,
                    'Publication Date': result['journalInfo']['printPublicationDate'],
                    'Citation Reference': self.get_citation_reference(title),
                    'Publication in MTBLS': pub.title,
                    'Journal in EuropePMC': result['journalInfo']['journal']['title'],
                    'Released before curated?': self.assess_if_trangressed(study_status, result['journalInfo'])
                })
            else:
                # No Europe PMC hit maps to this publication: report what the study itself records.
                temp_dict = base_return_dict.cascade({
                    'Title': title,
                    'PubmedId': pub.pubmed_id,
                    'DOI': pub.doi,
                    'Author List': pub.author_list,
                    'Publication Date': 'N/A',
                    'Citation Reference': self.get_citation_reference(title),
                    'Publication in MTBLS': pub.title,
                    'Journal in EuropePMC': 'N/A',
                    'Publication the same?': False,
                    'Released before curated?': 'N/A'
                })
            row_dicts.append(temp_dict)

    # Truthiness check instead of ``len(...) is 0``: identity comparison against an int literal
    # only works by accident of CPython's small-int caching and emits a SyntaxWarning.
    if not publications:
        row_dicts.append(base_return_dict)
    return row_dicts