def get_gene_pmids(genes):
    """Collect the de-duplicated PMIDs found for a collection of genes.

    Parameters
    ----------
    genes : iterable of str
        Gene names, each passed to ``pubmed_client.get_ids_for_gene``.

    Returns
    -------
    list
        The PMIDs gathered over all genes with duplicates removed. The
        order is arbitrary because the values pass through a ``set``.
    """
    collected = []
    for gene in genes:
        gene_pmids = pubmed_client.get_ids_for_gene(gene)
        # Report the per-gene count before merging into the running list
        print('%s: %d' % (gene, len(gene_pmids)))
        collected.extend(gene_pmids)
    unique_pmids = set(collected)
    return list(unique_pmids)
def get_pmids(gene_names):
    """Concatenate the PMIDs returned for each of the given gene names.

    Parameters
    ----------
    gene_names : iterable of str
        Gene names, each passed to ``get_ids_for_gene``.

    Returns
    -------
    list
        All PMIDs in lookup order. Duplicates are NOT removed.
    """
    collected = []
    for name in gene_names:
        found = get_ids_for_gene(name)
        collected.extend(found)
        # Report how many PMIDs this gene contributed
        print('%s: %d PMIDs' % (name, len(found)))
    return collected
def get_pmids(ambig_terms):
    """Get PMIDs for each ambiguous term, keeping only unambiguous PMIDs.

    Parameters
    ----------
    ambig_terms : iterable
        Term objects exposing ``db``, ``id`` and (for HGNC terms)
        ``entry_name`` attributes.

    Returns
    -------
    dict
        Maps ``(term.db, term.id)`` to a list of PMIDs. Only terms whose
        ``db`` is 'HGNC' or 'MESH' get an entry. A PMID is kept only if it
        occurred exactly once across all the lists that were fetched.
    """
    pmids_by_term = {}
    occurrences = Counter()
    for term in ambig_terms:
        term_key = (term.db, term.id)
        if term.db == 'HGNC':
            gene_name = term.entry_name
            try:
                found = pubmed_client.get_ids_for_gene(gene_name)
            except ValueError:
                print('Could not get PMIDs for gene: %s' % gene_name)
                found = []
        elif term.db == 'MESH':
            found = pubmed_client.get_ids_for_mesh(term.id,
                                                   major_topic=False)
            # Too many hits: retry restricted to major-topic annotations
            if len(found) > 1000:
                found = pubmed_client.get_ids_for_mesh(term.id,
                                                       major_topic=True)
            # Cap the list at 1000 entries either way
            found = found[:1000]
        else:
            print('Unhandled ambiguous term: %s' % str(term_key))
            continue
        pmids_by_term[term_key] = found
        occurrences.update(found)
        # Pause between successive lookups
        time.sleep(0.5)
    # Drop every PMID that was seen more than once overall
    return {
        term_key: [pmid for pmid in found if occurrences[pmid] == 1]
        for term_key, found in pmids_by_term.items()
    }
def get_gene_pmids(gene_names):
    """Return the unique PMIDs found for all genes of interest.

    Each name in `gene_names` is looked up with
    ``pubmed_client.get_ids_for_gene``; the results are merged,
    de-duplicated through a ``set`` (so the order is arbitrary), and the
    total count is printed before the list is returned.
    """
    collected = []
    for gene in gene_names:
        collected.extend(pubmed_client.get_ids_for_gene(gene))
    unique_pmids = list(set(collected))
    print('Found %d PMIDs for genes' % len(unique_pmids))
    return unique_pmids
def get_gene_pmids(genes):
    """Return the sorted, de-duplicated PMIDs found for the given genes.

    Each gene name is printed and then looked up with
    ``pubmed_client.get_ids_for_gene``.
    """
    unique_pmids = set()
    for gene in genes:
        print(gene)
        unique_pmids |= set(pubmed_client.get_ids_for_gene(gene))
    return sorted(unique_pmids)
def get_searchgenes_pmids(search_genes, num_days):
    """Look up PMIDs for each search gene.

    Parameters
    ----------
    search_genes : iterable of str
        Gene symbols, each passed to ``pubmed_client.get_ids_for_gene``.
    num_days : int
        Forwarded to the lookup as its ``reldate`` argument.

    Returns
    -------
    dict
        Maps each gene symbol to the value returned by the lookup. Symbols
        whose lookup raised ``ValueError`` are logged and left out.
    """
    pmids = {}
    for gene in search_genes:
        try:
            pmids[gene] = pubmed_client.get_ids_for_gene(gene,
                                                         reldate=num_days)
        except ValueError:
            # Pass the symbol as a logging argument so that the message
            # names the offending gene instead of a literal '%s'
            logger.error('Gene symbol %s is invalid', gene)
    return pmids
def get_searchgenes_pmids(search_genes, num_days):
    """Look up PMIDs for each search gene.

    Parameters
    ----------
    search_genes : iterable of str
        Gene symbols, each passed to ``pubmed_client.get_ids_for_gene``.
    num_days : int
        Forwarded to the lookup as its ``reldate`` argument.

    Returns
    -------
    dict
        Maps each gene symbol to the value returned by the lookup. Symbols
        whose lookup raised ``ValueError`` are logged and left out.
    """
    pmids = {}
    for symbol in search_genes:
        try:
            pmids[symbol] = pubmed_client.get_ids_for_gene(
                symbol, reldate=num_days)
        except ValueError:
            # The original call never supplied the symbol, so the log
            # showed a literal '%s'; pass it as the format argument
            logger.error('Gene symbol %s is invalid', symbol)
    return pmids
def get_gene_pmids(genes, out_file='pmids.txt'):
    """Get the sorted unique PMIDs for the genes and save them to a file.

    Parameters
    ----------
    genes : iterable of str
        Gene names, each printed and then passed to
        ``pubmed_client.get_ids_for_gene``.
    out_file : str
        Path of the file to write, one PMID per line, UTF-8 encoded.

    Returns
    -------
    list
        The sorted, de-duplicated PMIDs that were written.
    """
    unique_pmids = set()
    for gene in genes:
        print(gene)
        unique_pmids.update(pubmed_client.get_ids_for_gene(gene))
    ordered_pmids = sorted(unique_pmids)
    # Binary mode: each line is encoded explicitly before being written
    with open(out_file, 'wb') as fh:
        for pmid in ordered_pmids:
            line = '%s\n' % pmid
            fh.write(line.encode('utf-8'))
    return ordered_pmids
def get_gene_pmids(genes, out_file='pmids.txt'):
    """Get the sorted unique PMIDs for the genes and save them to a file.

    Parameters
    ----------
    genes : iterable of str
        Gene names, each printed and then passed to
        ``pubmed_client.get_ids_for_gene``.
    out_file : str
        Path of the file to write, one PMID per line, UTF-8 encoded.

    Returns
    -------
    list
        The sorted, de-duplicated PMIDs that were written.
    """
    seen = set()
    for gene in genes:
        print(gene)
        gene_pmids = pubmed_client.get_ids_for_gene(gene)
        seen |= set(gene_pmids)
    result = sorted(seen)
    # The file is opened in binary mode, so encode every line ourselves
    with open(out_file, 'wb') as fh:
        for pmid in result:
            encoded_line = ('%s\n' % pmid).encode('utf-8')
            fh.write(encoded_line)
    return result
def get_text_content_for_gene(hgnc_name):
    """Get articles that have been annotated to contain gene in entrez

    Parameters
    ----------
    hgnc_name : str
        HGNC name for gene

    Returns
    -------
    text_content : list of str
        xmls of fulltext if available otherwise abstracts for all articles
        that have been annotated in entrez to contain the given gene
    """
    # Look up the PMIDs for the gene and hand them straight to the
    # PMID-based content getter
    return get_text_content_for_pmids(
        pubmed_client.get_ids_for_gene(hgnc_name))
def get_text_content_for_gene(hgnc_name):
    """Get text content for articles annotated in entrez to contain a gene

    Parameters
    ----------
    hgnc_name : str
        HGNC name for gene

    Returns
    -------
    text_content : list of str
        xmls of fulltext if available otherwise abstracts for all articles
        that have been annotated in entrez to contain the given gene
    """
    gene_pmids = pubmed_client.get_ids_for_gene(hgnc_name)
    text_content = get_text_content_for_pmids(gene_pmids)
    return text_content
def get_ids():
    """Search PubMed for references for the Ras 227 gene set.

    If both cache files ('reading/pmids.pkl' and
    'reading/pmids_from_gene.pkl') already exist, their unpickled contents
    are returned without querying PubMed. Otherwise the gene list is read
    from the first column of the tab-delimited 'ras_pathway_proteins.csv'
    resource file, each gene is looked up both via
    ``pubmed_client.get_ids_for_gene`` and via a string query with
    ``pubmed_client.get_ids``, and both results are pickled to the cache
    files.

    Returns
    -------
    tuple
        ``(pmids, pmids_from_gene)``. When freshly computed, both are
        OrderedDicts keyed by gene name whose values are sets: the string
        query results and the gene query results, respectively.
    """
    pmids_path = 'reading/pmids.pkl'
    pmids_from_gene_path = 'reading/pmids_from_gene.pkl'

    # Check if we've got the files already
    if os.path.isfile(pmids_path) and os.path.isfile(pmids_from_gene_path):
        # Pickle data is binary: the files must be opened with 'rb' (text
        # mode makes pickle.load fail on Python 3).
        # NOTE: pickle is only safe because these are cache files written
        # by this function; never point these paths at untrusted data.
        with open(pmids_path, 'rb') as pmids_file:
            pmids = pickle.load(pmids_file)
        with open(pmids_from_gene_path, 'rb') as pmids_from_gene_file:
            pmids_from_gene = pickle.load(pmids_from_gene_file)
        return (pmids, pmids_from_gene)

    # STEP 0: Get gene list from ras_pathway_proteins.csv
    fname = os.path.join(indra.__path__[0], 'resources',
                         'ras_pathway_proteins.csv')
    with open(fname) as f:
        csvreader = csv.reader(f, delimiter='\t')
        gene_list = [row[0].strip() for row in csvreader]

    pmids = OrderedDict()
    pmids_from_gene = OrderedDict()

    for gene in gene_list:
        print("Querying for %s" % gene)
        ids_gene = set(pubmed_client.get_ids_for_gene(gene))
        print("Found %d in gene query" % len(ids_gene))
        # Hack to deal with excessive number of names
        if gene == 'MET':
            query_gene = 'CMET'
        elif gene == 'JUN':
            query_gene = 'CJUN'
        else:
            query_gene = gene
        ids_pubmed = set(pubmed_client.get_ids(query_gene, retmax=100000))
        print("Found %d in string query" % len(ids_pubmed))
        pmids[gene] = ids_pubmed
        pmids_from_gene[gene] = ids_gene

    with open(pmids_path, 'wb') as f:
        pickle.dump(pmids, f)
    with open(pmids_from_gene_path, 'wb') as f:
        pickle.dump(pmids_from_gene, f)
    return pmids, pmids_from_gene
def get_ids():
    """Search PubMed for references for the Ras 227 gene set.

    If both cache files ('reading/pmids.pkl' and
    'reading/pmids_from_gene.pkl') already exist, their unpickled contents
    are returned without querying PubMed. Otherwise the gene list is read
    from the first column of the tab-delimited
    '../../data/ras_pathway_proteins.csv' file, each gene is looked up both
    via ``pubmed_client.get_ids_for_gene`` and via a string query with
    ``pubmed_client.get_ids``, and both results are pickled to the cache
    files.

    Returns
    -------
    tuple
        ``(pmids, pmids_from_gene)``. When freshly computed, both are
        OrderedDicts keyed by gene name whose values are sets: the string
        query results and the gene query results, respectively.
    """
    pmids_path = 'reading/pmids.pkl'
    pmids_from_gene_path = 'reading/pmids_from_gene.pkl'

    # Check if we've got the files already
    if os.path.isfile(pmids_path) and os.path.isfile(pmids_from_gene_path):
        # Pickle files are read and written in binary mode so that the
        # code behaves the same on Python 2 and Python 3.
        # NOTE: pickle is only safe because these are cache files written
        # by this function; never point these paths at untrusted data.
        with open(pmids_path, 'rb') as pmids_file:
            pmids = pickle.load(pmids_file)
        with open(pmids_from_gene_path, 'rb') as pmids_from_gene_file:
            pmids_from_gene = pickle.load(pmids_from_gene_file)
        return (pmids, pmids_from_gene)

    # STEP 0: Get gene list from ras_pathway_proteins.csv
    with open('../../data/ras_pathway_proteins.csv') as f:
        csvreader = csv.reader(f, delimiter='\t')
        gene_list = [row[0].strip() for row in csvreader]

    pmids = OrderedDict()
    pmids_from_gene = OrderedDict()

    for gene in gene_list:
        # Single-argument print(...) calls produce the same output under
        # both the Python 2 print statement and the Python 3 function
        print("Querying for %s" % gene)
        ids_gene = set(pubmed_client.get_ids_for_gene(gene))
        print("Found %d in gene query" % len(ids_gene))
        # Hack to deal with excessive number of names
        if gene == 'MET':
            query_gene = 'CMET'
        elif gene == 'JUN':
            query_gene = 'CJUN'
        else:
            query_gene = gene
        ids_pubmed = set(pubmed_client.get_ids(query_gene, retmax=100000))
        print("Found %d in string query" % len(ids_pubmed))
        pmids[gene] = ids_pubmed
        pmids_from_gene[gene] = ids_gene

    with open(pmids_path, 'wb') as f:
        pickle.dump(pmids, f)
    with open(pmids_from_gene_path, 'wb') as f:
        pickle.dump(pmids_from_gene, f)
    return (pmids, pmids_from_gene)
def test_get_ids_for_gene():
    """The gene lookup for EXOC1 returns a non-empty result that passes
    the ``unicode_strs`` check."""
    exoc1_pmids = pubmed_client.get_ids_for_gene('EXOC1')
    assert exoc1_pmids
    assert unicode_strs(exoc1_pmids)
def get_pmids_entrez(kinase):
    """Return the result of ``get_ids_for_gene`` for the given kinase.

    The function sleeps for one second after the lookup before returning.
    """
    kinase_pmids = get_ids_for_gene(kinase)
    # Pause after the lookup completes
    time.sleep(1)
    return kinase_pmids
import pickle
import sys
from indra.literature import pubmed_client
from indra.tools.reading import submit_reading_pipeline as sub_aws
from indra.tools import assemble_corpus as ac
from indra.util import write_unicode_csv

# Base name for this run, taken from the first command-line argument
# (`sys` is imported above because it is used here)
basename = sys.argv[1]

# Get gene list: one gene name per line of genes.txt
with open('genes.txt', 'rt') as f:
    genes = [line.strip() for line in f]

# Assemble a list of PMIDs curated in Entrez gene
pmids_for_genes = {}
for gene in genes:
    try:
        gene_pmids = pubmed_client.get_ids_for_gene(gene)
    except ValueError:
        print("%s: Invalid gene name, skipping" % gene)
        continue
    print("%s: %d articles" % (gene, len(gene_pmids)))
    pmids_for_genes[gene] = gene_pmids

# Flatten the per-gene lists into one set of unique PMIDs
pmids = {pmid for pmid_list in pmids_for_genes.values()
         for pmid in pmid_list}

# Save the PMIDs to a file
print("Saving PMIDs")
with open('lab_meeting_pmids.txt', 'wt') as f:
    for pmid in pmids:
        f.write('%s\n' % pmid)

#job_ids = sub_aws.submit_run_reach(basename, 'lab_meeting_pmids.txt',
# Cache file holding the gene name -> PMIDs dictionary between runs
dict_filename = 'pmids_for_gene.pkl'
if not os.path.exists(dict_filename):
    pmids_for_gene = {}
else:
    with open(dict_filename, 'rb') as f:
        pmids_for_gene = pickle.load(f)

# Get PMIDs for each HGNC ID
num_added = 0
for hgnc_name in hgnc_names:
    #print('Getting PMIDs for %s' % hgnc_name)
    # Genes already present in the cached dictionary are not queried again
    if hgnc_name in pmids_for_gene:
        print('%s: already got PMIDs, skipping' % hgnc_name)
        continue
    try:
        pmids = pubmed_client.get_ids_for_gene(hgnc_name)
    except ValueError as ex:
        print("Exception in gettting PMIDs for %s: %s" % (hgnc_name, ex))
        print("Continuing...")
        continue
    print('%s: %d PMIDs' % (hgnc_name, len(pmids)))
    pmids_for_gene[hgnc_name] = pmids
    num_added += 1
    # Checkpoint the dictionary to disk after every 50 newly added genes
    # NOTE(review): no save after the loop is visible in this excerpt, so
    # up to 49 trailing additions may go unsaved -- confirm a final save
    # happens later in the file
    if not num_added % 50:
        print("Saving info for %d genes" % len(pmids_for_gene))
        with open(dict_filename, 'wb') as f:
            pickle.dump(pmids_for_gene, f)

# Flatten the per-gene lists into one set of unique PMIDs
unique_pmids = {
    pmid
    for pmid_list in pmids_for_gene.values()
    for pmid in pmid_list
}
def test_get_ids_for_gene():
    """Looking up EXOC1 gives a truthy result accepted by ``unicode_strs``."""
    gene_pmids = pubmed_client.get_ids_for_gene('EXOC1')
    assert gene_pmids
    assert unicode_strs(gene_pmids)
def test_get_ids_for_gene():
    """Looking up EXOC1 gives a truthy result accepted by ``unicode_strs``."""
    # Short pause before issuing the lookup
    time.sleep(0.3)
    exoc1_pmids = pubmed_client.get_ids_for_gene('EXOC1')
    assert exoc1_pmids
    assert unicode_strs(exoc1_pmids)
def test_get_ids_for_gene():
    """Looking up EXOC1 gives a non-empty (truthy) result."""
    # Half-second pause before issuing the lookup
    time.sleep(0.5)
    exoc1_pmids = pubmed_client.get_ids_for_gene('EXOC1')
    assert exoc1_pmids
def test_get_ids_for_gene():
    """The gene lookup for EXOC1 returns a non-empty result that passes
    the ``unicode_strs`` check."""
    # Short pause before issuing the lookup
    time.sleep(0.3)
    gene_pmids = pubmed_client.get_ids_for_gene('EXOC1')
    assert gene_pmids
    assert unicode_strs(gene_pmids)