def ukpmc(ids=None):
    """Load text-mined entities from UKPMC into stored publications.

    For every PubMed id the UKPMC abstract page is fetched and the annotated
    ``<span>`` elements (disease, protein, geneOntology, species, chemical)
    are scraped out of the HTML. Entities are de-duplicated by lower-cased
    name, assigned to ``pub.entities``, and the publication is saved when at
    least one entity was found. Pages that do not answer with HTTP 200 are
    skipped silently.

    Args:
        ids: Comma-separated string of PubMed ids. When empty or ``None``
            the module-level ``demo_pubmeds`` is used instead.
    """
    import re

    import requests
    # Only smart_str is needed; the previously imported smart_unicode was
    # unused (and is not available in Python 3 versions of Django).
    from django.utils.encoding import smart_str

    pubmed_ids = ids.split(",") if ids else demo_pubmeds
    base_url = 'http://ukpmc.ac.uk/abstract/MED/'
    # Compiled once: group 1 is the annotation class, group 2 the link text.
    entity_re = re.compile(
        r'<span class="(disease|protein|geneOntology|species|chemical)".*?_blank">(.*?)</a></span>')
    col = mongo.getCollection('publication')

    for pubmed_id in pubmed_ids:
        print("#### proceesing %s" % pubmed_id)
        # NOTE(review): load_pubmeds in this file stores publications under
        # 'publ_pubmed<id>', whereas this looks up 'publ<id>' -- confirm the
        # expected key format.
        record = col.find_one({'_id': 'publ%s' % pubmed_id})
        pub = Publication(record)

        response = requests.get("%s%s" % (base_url, pubmed_id))
        if response.status_code != 200:
            continue

        content = smart_str(response.text)
        # Keyed by name so that repeated mentions collapse into one entry
        # (the last occurrence wins).
        entities = {}
        for match in entity_re.finditer(content):
            group = match.group(1)
            group = 'go' if group == 'geneOntology' else group.lower()
            name = match.group(2).lower()
            entities[name] = {'name': name, 'group': group}

        pub.entities = list(entities.values())
        if pub.entities:
            pub.save()
        print("Saved %d items" % len(entities))
def ukpmc(ids=None):
    """Scrape UKPMC entity annotations and attach them to publications.

    Each PubMed id is resolved to its UKPMC abstract page. When the page is
    returned with HTTP 200, every annotated span (disease, protein,
    geneOntology, species, chemical) is collected, de-duplicated by its
    lower-cased text, and stored on the publication as ``entities``. The
    publication is saved only if at least one entity was extracted.

    Args:
        ids: Comma-separated PubMed ids as a single string; ``None`` or an
            empty string falls back to the module-level ``demo_pubmeds``.
    """
    import re

    import requests
    # smart_unicode used to be imported here as well but was never used.
    from django.utils.encoding import smart_str

    id_list = ids.split(",") if ids else demo_pubmeds
    url = 'http://ukpmc.ac.uk/abstract/MED/'
    # Hoisted out of the loop; the pattern itself is unchanged.
    span_pattern = re.compile(
        r'<span class="(disease|protein|geneOntology|species|chemical)".*?_blank">(.*?)</a></span>')
    publications = mongo.getCollection('publication')

    # ``pmid`` rather than ``id`` so the builtin is not shadowed.
    for pmid in id_list:
        print("#### proceesing %s" % pmid)
        # NOTE(review): the lookup key is 'publ<id>' while load_pubmeds in
        # this file writes 'publ_pubmed<id>' -- verify which one is intended.
        stored = publications.find_one({'_id': 'publ%s' % pmid})
        pub = Publication(stored)

        page = requests.get("%s%s" % (url, pmid))
        if page.status_code == 200:
            html = smart_str(page.text)

            found = {}
            for hit in span_pattern.finditer(html):
                kind = hit.group(1)
                # 'geneOntology' is abbreviated; other classes are lower-cased.
                kind = 'go' if kind == 'geneOntology' else kind.lower()
                label = hit.group(2).lower()
                # Later mentions of the same label replace earlier ones.
                found[label] = {'name': label, 'group': kind}

            pub.entities = list(found.values())
            if pub.entities:
                pub.save()
            print("Saved %d items" % len(found))
def load_pubmeds(ids=None):
    """Fetch PubMed records from TogoWS and store them with their authors.

    Each id is downloaded as XML and turned into a ``Publication`` with
    ``_id`` ("publ_pubmed<id>"), ``refs``, ``name``, ``abstract``,
    ``language`` and ``authors``. Every author with a last name is inserted
    into the ``people`` collection (or looked up there when the insert
    fails) and linked to the publication. After all ids are processed the
    publications are inserted into the ``publication`` collection.

    A failure while processing one id, or while inserting one publication,
    is printed with its traceback and does not stop the remaining work.

    Args:
        ids: Comma-separated string of PubMed ids. When empty or ``None``
            the module-level ``demo_pubmeds`` is used instead.

    Returns:
        The list of ``Publication`` objects that were built, including any
        whose author processing or insert failed part-way.
    """
    ids = ids.split(",") if ids else demo_pubmeds
    url = "http://togows.dbcls.jp/entry/pubmed/$ID?format=xml"

    # Publication document shape noted by the original author:
    #   pub = {'_id': '', 'name': '', 'refs': {'pubmed': ''}, 'abstract': '',
    #          'local': 0, 'url': '', 'published': 1, 'authors': []}

    def _text(node):
        # Value of an optional XML node, or '' when the node/value is missing.
        return node.value if node and node.value else ''

    pc = mongo.getCollection('people')
    try:
        pc.create_index([("last", 1), ("middle", 1), ("first", 1)],
                        unique=True)
    except Exception:
        # Best effort: index creation failures are deliberately ignored.
        pass

    pubs = []
    # Iterate the resolved ``ids``; the previous ``pubmeds`` name was never
    # defined in this function.
    for pid in ids:
        try:
            uri = url.replace('$ID', pid)
            print("Loading %s" % uri)
            doc = XML2Dict().fromurl(uri)
            article = doc.PubmedArticleSet.PubmedArticle.MedlineCitation.Article

            pub = Publication()
            pub._id = "publ_pubmed%s" % (pid)
            pub.refs = {'pubmed': pid}
            pub.name = (article['ArticleTitle']['value']
                        if article.ArticleTitle else '')

            pub.abstract = ''
            if article.Abstract and article.Abstract.AbstractText:
                texts = article.Abstract.AbstractText
                # A single AbstractText element is not wrapped in a list.
                if not isinstance(texts, list):
                    texts = [texts]
                pub.abstract = "\n\n".join([text['value'] for text in texts])

            pub.language = (article['Language']['value']
                            if article.Language else '')
            # Registered before author handling so that a failure below
            # still leaves the publication queued for insertion.
            pubs.append(pub)

            pub.authors = []
            authors = article['AuthorList']['Author']
            # Same single-vs-list handling as AbstractText above; otherwise
            # a one-author article would not be iterated author by author.
            if not isinstance(authors, list):
                authors = [authors]

            for author in authors:
                people = {
                    'first': _text(author.ForeName),
                    'last': _text(author.LastName),
                    'middle': _text(author.Initials),
                }
                if not people['last']:
                    continue
                people['namekey'] = "%s.%s.%s" % (people['first'].lower(),
                                                  people['middle'].lower(),
                                                  people['last'].lower())
                people['_id'] = idtool.generate('peop')
                try:
                    pc.insert(people, safe=True)
                    print("Inserted %s" % people)
                except Exception:
                    # Insert failed (e.g. the unique name index rejected
                    # it): fall back to the already stored record.
                    del people['_id']
                    people = pc.find_one(people)
                if people:
                    pc.update({'_id': people['_id']},
                              {'$addToSet': {'publications': pub._id}},
                              safe=True)
                    pub.authors.append(people)
        except Exception:
            print("ERROR: %s" % traceback.format_exc())

    pubc = mongo.getCollection('publication')
    for pub in pubs:
        try:
            pubc.insert(pub)
            print("Inserted pub: %s" % pub)
        except Exception:
            print("ERROR %s" % traceback.format_exc())
    log("Done")
    return pubs
def load_pubmeds(ids=None):
    """Import PubMed articles and their authors into the database.

    For each PubMed id the TogoWS XML entry is loaded and converted into a
    ``Publication`` (``_id`` "publ_pubmed<id>", ``refs``, ``name``,
    ``abstract``, ``language``, ``authors``). Authors that have a last name
    are inserted into the ``people`` collection, or fetched from it when
    the insert fails, and the publication id is added to their
    ``publications`` set. Finally every built publication is inserted into
    the ``publication`` collection.

    Errors for a single id or a single publication insert are printed with
    a traceback; processing then continues with the next item.

    Args:
        ids: Comma-separated PubMed ids as one string; ``None`` or an empty
            string falls back to the module-level ``demo_pubmeds``.

    Returns:
        List of the ``Publication`` objects that were built.
    """
    ids = ids.split(",") if ids else demo_pubmeds
    url = "http://togows.dbcls.jp/entry/pubmed/$ID?format=xml"

    # Publication document shape noted by the original author:
    #   pub = {'_id': '', 'name': '', 'refs': {'pubmed': ''}, 'abstract': '',
    #          'local': 0, 'url': '', 'published': 1, 'authors': []}

    def _as_list(node):
        # XML elements that occur once are not wrapped in a list.
        return node if isinstance(node, list) else [node]

    def _value_of(node):
        # Text of an optional element; '' when it is absent or empty.
        if node and node.value:
            return node.value
        return ''

    pc = mongo.getCollection('people')
    try:
        pc.create_index([("last", 1), ("middle", 1), ("first", 1)],
                        unique=True)
    except Exception:
        # Index creation is best effort; failures are intentionally ignored.
        pass

    pubs = []
    for pid in ids:
        try:
            uri = url.replace('$ID', pid)
            print("Loading %s" % uri)
            doc = XML2Dict().fromurl(uri)
            article = doc.PubmedArticleSet.PubmedArticle.MedlineCitation.Article

            pub = Publication()
            pub._id = "publ_pubmed%s" % (pid)
            pub.refs = {'pubmed': pid}

            if article.ArticleTitle:
                pub.name = article['ArticleTitle']['value']
            else:
                pub.name = ''

            pub.abstract = ''
            if article.Abstract and article.Abstract.AbstractText:
                paragraphs = _as_list(article.Abstract.AbstractText)
                pub.abstract = "\n\n".join(
                    [paragraph['value'] for paragraph in paragraphs])

            if article.Language:
                pub.language = article['Language']['value']
            else:
                pub.language = ''

            # Queue the publication before touching authors, so an author
            # failure does not drop the publication itself.
            pubs.append(pub)
            pub.authors = []

            # Normalised like AbstractText: a lone <Author> would otherwise
            # not be iterated as a list of authors.
            for author in _as_list(article['AuthorList']['Author']):
                first = _value_of(author.ForeName)
                last = _value_of(author.LastName)
                middle = _value_of(author.Initials)
                if not last:
                    continue

                people = {
                    'first': first,
                    'last': last,
                    'middle': middle,
                    'namekey': "%s.%s.%s" % (first.lower(), middle.lower(),
                                             last.lower()),
                    '_id': idtool.generate('peop'),
                }
                try:
                    pc.insert(people, safe=True)
                    print("Inserted %s" % people)
                except Exception:
                    # Probably a duplicate under the unique name index:
                    # look the existing person up without the new _id.
                    del people['_id']
                    people = pc.find_one(people)

                if people:
                    pc.update({'_id': people['_id']},
                              {'$addToSet': {'publications': pub._id}},
                              safe=True)
                    pub.authors.append(people)
        except Exception:
            print("ERROR: %s" % traceback.format_exc())

    pubc = mongo.getCollection('publication')
    for pub in pubs:
        try:
            pubc.insert(pub)
            print("Inserted pub: %s" % pub)
        except Exception:
            print("ERROR %s" % traceback.format_exc())

    log("Done")
    return pubs