def check_trialpubs_nctids(review_id, review_doi=None, sess_id=None):
    """Resolve the references of a review to PMIDs and NCT IDs.

    Fetches the review's crossref metadata and tries to map every cited
    reference to a PubMed ID (DOI search, NCBI citation matching, then
    ``batch_doi2pmid`` as a last resort) and collects NCT trial-registry
    IDs mentioned in unstructured citations.

    @param review_id: PubMed ID of the review
    @param review_doi: DOI of the review; looked up from the PubMed record
        when not supplied
    @param sess_id: session ID if transmitting progress via websocket
    @return: namedtuple ``ids(pmids, nctids)`` when crossref supplies
        references, ``False`` when it does not, ``None`` when no DOI can
        be found or crossref returns 404 for it
    """
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')
    ec = Client(api_key=eutils_key)
    cr = Crossref(mailto=config.MAIL_USERNAME)
    if not review_doi:
        # Look the DOI up from the review's PubMed record, retrying on
        # transient NCBI/network failures.
        while True:
            try:
                paset = ec.efetch(db='pubmed', id=review_id)
                break
            except (eutils.EutilsNCBIError, eutils.EutilsRequestError,
                    requests.exceptions.SSLError,
                    requests.exceptions.ConnectionError) as e:
                print(e)
                time.sleep(5)
        try:
            pa = next(iter(paset))
        except StopIteration as e:
            # NCBI occasionally returns an empty result set transiently;
            # wait and retry the whole resolution from scratch.
            print('##EMPTY ITERATOR', e)
            print('retrying...')
            time.sleep(60)
            return check_trialpubs_nctids(review_id, review_doi, sess_id)
        if hasattr(pa, 'doi'):
            review_doi = pa.doi
        if not review_doi:
            if sess_id:
                socketio.emit(
                    'crossrefbot_update',
                    {'msg': 'No trials found. Crossrefbot complete'},
                    room=sess_id)
            return
    retry_attempts = 0
    while True:
        try:
            if review_doi[-1] == '.':
                # Trailing dot is a common metadata artefact.
                review_doi = review_doi[:-1]
            resp = cr.works(ids=[str(review_doi)])
            break
        except requests.HTTPError as e:
            if e.response.status_code == 404:
                # Crossref does not know this DOI; nothing to resolve.
                if sess_id:
                    socketio.emit(
                        'crossrefbot_update',
                        {'msg': 'No trials found. Crossrefbot complete'},
                        room=sess_id)
                print(e)
                return
            time.sleep(5)
            print('UNHANDLED HTTP ERROR', e)
            print('retrying...')
        except requests.exceptions.ConnectionError as e:
            print(e)
            print('connection error, retrying...')
            # Give up after repeated connection failures instead of
            # spinning forever (an unreachable `break` after this raise
            # has been removed).
            if retry_attempts >= 6:
                raise Exception('failed too many times')
            retry_attempts += 1
            time.sleep(10)
    if resp['status'] == 'ok':
        parsed = resp['message']
        if "reference" in parsed:
            references = parsed['reference']
            if sess_id:
                socketio.emit(
                    'crossrefbot_update',
                    {'msg': '%s references in crossref. trying to resolve'
                            ' to PubMed articles' % len(references)},
                    room=sess_id)
                eventlet.sleep(0)  # yield so the emit gets flushed
            print('%s references found in crossref' % len(references))
            to_resolve = []
            dois = [ref["DOI"] for ref in references if 'DOI' in ref]
            # DOIs not yet matched to a PubMed record.
            unresolved_dois = []
            if dois:
                # If we get PubMed metadata for these DOIs we can
                # cross-check which DOIs match our reference set.
                # esearch truncates replies at ~250 results, so query in
                # chunks (https://github.com/biocommons/eutils/issues/124/).
                for chunk in utils.chunks(dois, 250):
                    while True:
                        try:
                            with eventlet.Timeout(300):
                                esr = ec.esearch(
                                    db='pubmed',
                                    term=' OR '.join(
                                        '"' + doi + '"[AID]'
                                        for doi in chunk))
                            break
                        except (eutils.EutilsNCBIError,
                                eutils.EutilsRequestError,
                                requests.exceptions.SSLError,
                                requests.exceptions.ConnectionError,
                                lxml.etree.XMLSyntaxError,
                                eventlet.timeout.Timeout) as e:
                            print('possible timeout?', e)
                            time.sleep(5)
                    if esr.ids:
                        while True:
                            try:
                                paset = ec.efetch(db='pubmed', id=esr.ids)
                                break
                            except (eutils.EutilsNCBIError,
                                    eutils.EutilsRequestError,
                                    requests.exceptions.SSLError,
                                    requests.exceptions.ConnectionError,
                                    requests.exceptions.ReadTimeout,
                                    requests.exceptions.ChunkedEncodingError
                                    ) as e:
                                print(e)
                                time.sleep(5)
                        for pma in paset:
                            # Only count records whose DOI really is one
                            # of ours.
                            if pma.doi is not None and pma.doi in chunk:
                                chunk.remove(pma.doi)
                                to_resolve.append(pma.pmid)
                    # Fix: the old code rebound `dois` to each chunk, so
                    # only the LAST chunk's leftovers survived this loop.
                    unresolved_dois.extend(chunk)
            # References without a resolved DOI but with enough citation
            # metadata for NCBI's citation matcher.
            remaining = [
                x for x in references
                if ('DOI' not in x or x['DOI'] in unresolved_dois)
                and ('first-page' in x or 'author' in x
                     or 'article-title' in x or 'volume' in x
                     or 'journal-title' in x or 'year' in x)
            ]
            if remaining:
                citation_pmids = ecitmatch_tools.batch_pmids_for_citation(
                    remaining, debug=True)
                check_metadata = []
                if citation_pmids:
                    for citation in citation_pmids:
                        if utils.RepresentsInt(citation):
                            to_resolve.append(citation)
                            check_metadata.append(citation)
                        elif citation.startswith('AMBIGUOUS'):
                            # e.g. "AMBIGUOUS 123,456" -> candidate PMIDs
                            cand = citation[10:].split(',')
                            if utils.RepresentsInt(cand[0]):
                                to_resolve.extend(cand)
                                # extend, not append: a nested list would
                                # corrupt the efetch id list below
                                check_metadata.extend(cand)
                if check_metadata:
                    while True:
                        try:
                            with eventlet.Timeout(300):
                                paset = ec.efetch(db='pubmed',
                                                  id=check_metadata)
                            break
                        except (eutils.EutilsNCBIError,
                                eutils.EutilsRequestError,
                                requests.exceptions.SSLError,
                                requests.exceptions.ConnectionError,
                                eventlet.timeout.Timeout) as e:
                            print('possible timeout?')
                            print(e)
                            time.sleep(5)
                    for pma in paset:
                        if pma.doi is not None \
                                and pma.doi in unresolved_dois:
                            unresolved_dois.remove(pma.doi)
                            to_resolve.append(pma.pmid)
            # Last resort for DOIs still unmatched.
            try_doi = batch_doi2pmid(unresolved_dois)
            if try_doi:
                for doi in try_doi:
                    if utils.RepresentsInt(str(doi)):
                        to_resolve.append(doi)
            nct_ids = []
            for citation in references:
                if 'unstructured' in citation:
                    # Scan free-text citations for trial registry IDs:
                    # "NCT" + 8 digits, 11 characters total.  (The loop
                    # variable no longer shadows an outer index.)
                    for token in citation['unstructured'].split(' '):
                        if re.match(r"(NCT|nct)[0-9]{8}", token) \
                                and len(token) == 11:
                            nct_ids.append(token)
            to_resolve = list({str(x) for x in to_resolve})
            content = collections.namedtuple('ids', ['pmids', 'nctids'])
            return content(to_resolve, nct_ids)
    return False
######################################################################################################### ##### API-key (NCBI) eclient = Client(api_key="8ecce891e7fa036ff84bccc7c74e5138dc09") #gene_efetch = eclient.efetch(db='gene', id=91039) Entrez.email = "*****@*****.**" ######################################################################################################### ##### nucleotide search ### Setting up query mRNAtranscripts = [] transcriptmRNA_esearch = eclient.esearch( db='nucleotide', term='(' + gene + '[gene] AND "H**o sapiens"[Primary Organism] AND refseq[filter]) NOT biomol_genomic[PROP]' ) print("\nLoading currently available ids from Entrez nucleotide...") print("=" * 70) print("\nTranscript variant ids: ") print(transcriptmRNA_esearch.ids) for item in transcriptmRNA_esearch.ids: mRNAtranscripts.append(item) print("\nSearch results: {}\n".format(transcriptmRNA_esearch.count)) ### Esummary for retrieving information ### For each id in mRNAtranscripts ### Save data to csv file with open('results-nucleotide.csv', mode='w') as result_nucleotide: result_writer = csv.writer(result_nucleotide, delimiter=';') result_writer.writerow([
"--max", help="number of words to test", nargs='?', const=1, type=int, default=50) parser.add_argument("-c", "--corpus", help="the corpus (brown,webtext,gutenberg)", default="brown") args = parser.parse_args() print(args.corpus) corpus = eval(args.corpus) ec = Client(api_key=api.apikey) #replace with your NCBI apikey frequency_list = FreqDist(i.lower() for i in corpus.words()) print("word\tcorpusFreq\tpubmedFreq") for word in random.sample(set(corpus.words()), args.max): freq = frequency_list[word.lower()] #let's focus on somewhat common words if (freq > 1): try: a = ec.esearch(db='pubmed', term=word) print("{}\t{}\t{}".format(word, freq, a.count)) except (TimeoutError): time.sleep(5) #slow down buddy ec = Client(api_key=api.apikey) time.sleep(0.1) #ncbi will complain otherwise
if not re.search('Ontology|Taxonomy', ont_name, flags=re.IGNORECASE): if ont_name in cits: del cits[ont_name] ont_name = ont_name+' Ontology' t1 = time.time() #print("time: {}".format(t1-t0)) if t1-t0 > 60: ec = Client(api_key=api.apikey) if ont_name in cits and cits[ont_name] > 0: pass else: rp = re.compile("^The ") ont_name = rp.sub('', ont_name) ont_name = ont_name.replace('"', '') term = "({})".format(ont_name.replace(" ", "+")) a = ec.esearch(db='pubmed', term=term) cits[ont_name] = a.count if showUids: print("{}\t{}\t{}".format(ont_name, a.count, a.ids)) else: print("{}\t{}".format(ont_name, a.count)) newcites.seek(0) json.dump(cits, newcites) except: print("I probably timed out again or whatever...lemme catch my breath") time.sleep(2) newcites.seek(0) json.dump(cits, newcites) ec = Client(api_key=api.apikey) print("I identified {} ontologies".format(len(cits.keys())))
else: try: r[key] = str(r[key]) except UnicodeEncodeError: r[key] = r[key].encode('utf-8') if callable(record): r = record(r) elif record is not None: raise ValueError('Unknown record transform function (args.record).') if r: writer.write(r) client = Client(api_key = apikey) if prog == 'esearch': sret = client.esearch(db = db, term = term) try: error = list(sret._xml_root.find('ErrorList').iterchildren()) except: error = None print sret.count if not error else 0 if not sret.ids: rets = [] else: rets = client.efetch(db = db, id = sret.ids) rets = list(iter(rets)) writerResults(rets) else:
def check_trialpubs_nctids(review_id, review_doi=None, sess_id=None):
    """Resolve the references of a review to PMIDs and NCT IDs.

    Earlier, simpler variant of the resolver: no request timeouts and a
    single non-retried crossref lookup.  Ported from Python 2 to Python 3
    syntax for consistency with the sibling implementation in this file.

    @param review_id: PubMed ID of the review
    @param review_doi: DOI of the review; looked up from the PubMed record
        when not supplied
    @param sess_id: session ID if transmitting progress via websocket
    @return: namedtuple ``ids(pmids, nctids)`` when crossref supplies
        references, ``False`` when it does not, ``None`` when no DOI is
        found or the crossref lookup fails
    """
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')
    ec = Client(api_key=eutils_key)
    cr = Crossref(mailto=config.MAIL_USERNAME)
    if not review_doi:
        # Look the DOI up from the PubMed record, retrying on transient
        # NCBI/network failures.
        while True:
            try:
                paset = ec.efetch(db='pubmed', id=review_id)
                break
            except (eutils.exceptions.EutilsNCBIError,
                    eutils.exceptions.EutilsRequestError,
                    requests.exceptions.SSLError,
                    requests.exceptions.ConnectionError) as e:
                print(e)
                time.sleep(5)
        # NOTE(review): like the original, this raises StopIteration when
        # the fetch returns no records — confirm whether a retry is wanted.
        pa = next(iter(paset))
        if hasattr(pa, 'doi'):
            review_doi = pa.doi
        if not review_doi:
            if sess_id:
                socketio.emit(
                    'crossrefbot_update',
                    {'msg': 'No trials found. Crossrefbot complete'},
                    room=sess_id)
            return
    try:
        if review_doi[-1] == '.':
            # Trailing dot is a common metadata artefact.
            review_doi = review_doi[:-1]
        resp = cr.works(ids=[str(review_doi)])
    except requests.HTTPError as e:
        # Any HTTP failure is treated as "DOI unknown to crossref".
        if sess_id:
            socketio.emit(
                'crossrefbot_update',
                {'msg': 'No trials found. Crossrefbot complete'},
                room=sess_id)
        print(e)
        return
    if resp['status'] == 'ok':
        parsed = resp['message']
        if "reference" in parsed:
            references = parsed['reference']
            if sess_id:
                socketio.emit(
                    'crossrefbot_update',
                    {'msg': str(len(references)) + ' references found in'
                            ' crossref. trying to resolve these to PubMed'
                            ' articles...'},
                    room=sess_id)
                eventlet.sleep(0)  # yield so the emit gets flushed
            print(str(len(references)) + ' references found in crossref')
            to_resolve = []
            dois = [ref["DOI"] for ref in references if 'DOI' in ref]
            # DOIs not yet matched to a PubMed record.
            unresolved_dois = []
            if dois:
                # If we get PubMed metadata for these DOIs we can
                # cross-check which DOIs match our reference set.
                # esearch truncates at ~250 results, so query in chunks.
                for chunk in utils.chunks(dois, 250):
                    while True:
                        try:
                            esr = ec.esearch(
                                db='pubmed',
                                term=' OR '.join(
                                    '"' + doi + '"[AID]' for doi in chunk))
                            break
                        except (eutils.exceptions.EutilsNCBIError,
                                eutils.exceptions.EutilsRequestError,
                                requests.exceptions.SSLError,
                                requests.exceptions.ConnectionError,
                                lxml.etree.XMLSyntaxError) as e:
                            print(e)
                            time.sleep(5)
                    if esr.ids:
                        while True:
                            try:
                                paset = ec.efetch(db='pubmed', id=esr.ids)
                                break
                            except (eutils.exceptions.EutilsNCBIError,
                                    eutils.exceptions.EutilsRequestError,
                                    requests.exceptions.SSLError,
                                    requests.exceptions.ConnectionError
                                    ) as e:
                                print(e)
                                time.sleep(5)
                        for pma in paset:
                            # Only count records whose DOI really is one
                            # of ours.
                            if pma.doi is not None and pma.doi in chunk:
                                chunk.remove(pma.doi)
                                to_resolve.append(pma.pmid)
                    # Fix: the old code rebound `dois` to each chunk, so
                    # only the LAST chunk's leftovers survived this loop.
                    unresolved_dois.extend(chunk)
            # References without a resolved DOI but with enough citation
            # metadata for NCBI's citation matcher.
            remaining = [
                x for x in references
                if ('DOI' not in x or x['DOI'] in unresolved_dois)
                and ('first-page' in x or 'author' in x
                     or 'article-title' in x or 'volume' in x
                     or 'journal-title' in x or 'year' in x)
            ]
            if remaining:
                citation_pmids = ecitmatch_tools.batch_pmids_for_citation(
                    remaining, debug=False)
                check_metadata = []
                if citation_pmids:
                    for citation in citation_pmids:
                        if utils.RepresentsInt(citation):
                            to_resolve.append(citation)
                            check_metadata.append(citation)
                        elif citation.startswith('AMBIGUOUS'):
                            # e.g. "AMBIGUOUS 123,456" -> candidate PMIDs
                            cand = citation[10:].split(',')
                            if utils.RepresentsInt(cand[0]):
                                to_resolve.extend(cand)
                                # extend, not append: a nested list would
                                # corrupt the efetch id list below
                                check_metadata.extend(cand)
                if check_metadata:
                    while True:
                        try:
                            paset = ec.efetch(db='pubmed',
                                              id=check_metadata)
                            break
                        except (eutils.exceptions.EutilsNCBIError,
                                eutils.exceptions.EutilsRequestError,
                                requests.exceptions.SSLError,
                                requests.exceptions.ConnectionError) as e:
                            print(e)
                            time.sleep(5)
                    for pma in paset:
                        if pma.doi is not None \
                                and pma.doi in unresolved_dois:
                            unresolved_dois.remove(pma.doi)
                            to_resolve.append(pma.pmid)
            # Last resort for DOIs still unmatched.
            try_doi = batch_doi2pmid(unresolved_dois)
            if try_doi:
                for doi in try_doi:
                    if utils.RepresentsInt(str(doi)):
                        to_resolve.append(doi)
            nct_ids = []
            for citation in references:
                if 'unstructured' in citation:
                    # Scan free-text citations for trial registry IDs:
                    # "NCT" + 8 digits, 11 characters total.  (The loop
                    # variable no longer shadows an outer index.)
                    for token in citation['unstructured'].split(' '):
                        if re.match(r"(NCT|nct)[0-9]{8}", token) \
                                and len(token) == 11:
                            nct_ids.append(token)
            to_resolve = list({str(x) for x in to_resolve})
            content = collections.namedtuple('ids', ['pmids', 'nctids'])
            return content(to_resolve, nct_ids)
    return False