def find_missing_ids(self):
    """Fill in whichever of ``doi``, ``pubmed_id`` and ``pmcid`` are unset.

    The three fields are read from ``self.to_mongo()``. Nothing happens when
    all of them are already set or when none of them is set: at least one
    known identifier is needed to look up the others.

    Otherwise the first identifier that is present (checked in the order
    doi, pubmed_id, pmcid) is passed to ``find_remaining_ids`` and every
    non-None value in the mapping it returns is assigned to ``self[key]``.
    Note that this also rewrites fields that were already set if the lookup
    returns a value for them.
    """
    id_names = ('doi', 'pubmed_id', 'pmcid')

    # Serialize the document once and reuse it for all three lookups.
    stored = self.to_mongo()
    values = [stored.get(name, None) for name in id_names]
    known_ids = [value for value in values if value is not None]

    # We need at least one ID to resolve the others, and at least one gap
    # for the lookup to be worth doing.
    if not known_ids or len(known_ids) == len(id_names):
        return

    # find_remaining_ids is defined elsewhere; presumably it resolves the
    # other identifiers from a single known one -- confirm against the helper.
    remaining_ids = find_remaining_ids(known_ids[0])
    for field_name, value in remaining_ids.items():
        if value is not None:
            self[field_name] = value
def _parse_pubmed_id(self, doc):
    """Return the PubMed ID of ``doc``, resolved through its DOI.

    The DOI comes from ``self._parse_doi(doc)`` and is handed to
    ``find_remaining_ids``; the ``'pubmed_id'`` entry of that result is
    returned (presumably a str -- the lookup helper is defined elsewhere).
    """
    doi = self._parse_doi(doc)
    resolved_ids = find_remaining_ids(doi)
    return resolved_ids['pubmed_id']
def _parse_pmcid(self, doc):
    """Return the PMC ID of ``doc``, resolved through its DOI.

    The DOI comes from ``self._parse_doi(doc)`` and is handed to
    ``find_remaining_ids``; the ``'pmcid'`` entry of that result is returned
    (presumably a str -- the lookup helper is defined elsewhere).
    """
    doi = self._parse_doi(doc)
    resolved_ids = find_remaining_ids(doi)
    return resolved_ids['pmcid']
def _parse_doi(self, doc):
    """Return the DOI of ``doc``, looked up from its ``'pmid'`` entry.

    ``doc['pmid']`` is converted to ``str`` and passed to
    ``find_remaining_ids``. The ``'doi'`` entry of the result is returned,
    or ``None`` when the result has no such entry (or the entry is ``None``).
    Indexing ``doc['pmid']`` fails if the document has no ``'pmid'`` entry.
    """
    # .get() already yields None for a missing DOI, so no extra branch is
    # needed to normalise the "not found" case.
    return find_remaining_ids(str(doc['pmid'])).get('doi', None)
def _parse_pmcid(self, doc): """ Returns the pmcid of a document as a <class 'str'>.""" try: return doc['pmcid'] except: return find_remaining_ids(str(doc['pmid']))['pmcid']
def _parse_pubmed_id(self, doc): """ Returns the PubMed ID of a document as a <class 'str'>.""" if 'pmid' in doc.keys(): if doc['pmcid'] != '': return doc['pmcid'] return find_remaining_ids(self._parse_doi(doc))['pubmed_id']
def update_PubMed_entries(mongo_db):
    """Backfill the ``doi`` field on ``CDCN_extracted_PubMed`` documents.

    Every document of ``mongo_db['CDCN_extracted_PubMed']`` that has a
    non-empty ``'Repurposed Drug Name'`` is processed: its ``'PMID'`` is
    passed to ``find_remaining_ids`` and, when the returned ``'doi'`` is not
    None, the document is updated with ``{'$set': {'doi': doi}}``.

    Progress is reported with ``print``; nothing is returned.

    Args:
        mongo_db: Database handle indexed by collection name; the collection
            must offer ``find`` and ``find_one_and_update`` (presumably a
            PyMongo ``Database`` -- confirm against the caller).
    """
    col = mongo_db['CDCN_extracted_PubMed']

    query = col.find({}, )
    # NOTE(review): if this is a PyMongo cursor, Cursor.count() was deprecated
    # in PyMongo 3.7 and removed in 4.0; col.count_documents({}) is the
    # documented replacement once the minimum driver version is known.
    print('query.count()', query.count())

    # Propagating the drug names into the 'entries' collection (matched by
    # doi) and the 'entries_vespa' collection (matched by pubmed_id or doi)
    # is currently disabled, so these sets stay empty and the summary below
    # always reports 0.
    found_entries = set()
    found_entries_vespa = set()

    for doc in query:
        PMID = doc['PMID']

        # Skip documents without drug names before doing the ID lookup, so
        # that skipped documents do not pay for it.
        # TODO: remove drug name NR
        if not doc.get('Repurposed Drug Name', None):
            continue

        doi = find_remaining_ids(PMID)['doi']
        if doi is not None:
            print('doi', doi)
            col.find_one_and_update(
                {'_id': doc['_id']},
                {'$set': {'doi': doi}},
            )

    print('found_entries', len(found_entries))
    print('found_entries_vespa', len(found_entries_vespa))