Exemplo n.º 1
0
def main():
	"""Refresh publication fields in ent_experiment from Pubmed.

	For every experiment row that has a pub_pubmed id, look the id up in
	Pubmed and update title/authors/year/journal in place.
	"""
	conf = Config()

	logging.basicConfig(format = "%(asctime)s %(name)s %(levelname) -5s : %(message)s")
	log = logging.getLogger("reload_pubmed")
	log.setLevel(logging.DEBUG)

	pubmed = Pubmed()

	log.info("Connecting ...")

	conn = biomart_db_connect(conf["biomart.db"], log)

	cursor = conn.cursor()          # iterates the experiment rows
	update_cursor = conn.cursor()   # issues updates while iterating

	log.info("Querying experiments ...")

	cursor.execute("""
		select id, pub_pubmed, study_id, platf_id
		from ent_experiment where pub_pubmed is not NULL""")

	SPACES = re.compile(r"\s+")  # raw string: \s is a regex class, not a str escape

	# Fetch at the TOP of the loop. The original fetched at the bottom, so
	# the 'continue' below skipped the fetch and spun forever on the first
	# row whose PMID could not be found.
	while True:
		row = cursor.fetchone()
		if row is None:
			break

		exp_id, pmid, study_id, platf_id = row

		log.info(">>> PMID: {}, STUDY: {}, PLATFORM: {}".format(pmid, study_id, platf_id))

		pub = pubmed.find(pmid)
		if pub is None:
			log.error("PMID not found: {}".format(pmid))
			continue

		pub = pub[0]
		# Escape single quotes so the values survive the string-built SQL.
		# NOTE(review): formatting values into SQL is injection-prone even
		# for semi-trusted Pubmed data; prefer a parameterized
		# update_cursor.execute(sql, params) per PEP 249.
		for k, v in pub.items():
			if v is not None and isinstance(v, basestring):
				pub[k] = v.replace("'", r"\'")

		sql = u"""
			update ent_experiment
			set pub_title='{}', pub_authors='{}', pub_year='{}', pub_journal='{}'
			where id={}""".format(
				pub["title"], pub["short_authors"],
				pub["date"], pub["journal"], exp_id)

		log.debug(SPACES.sub(" ", sql.strip()))

		update_cursor.execute(sql)

	# NOTE(review): no explicit conn.commit() — presumably the connection
	# autocommits (or the table engine is non-transactional); confirm.
	cursor.close()
	update_cursor.close()
	conn.close()
Exemplo n.º 2
0
 def process(self):
     """Run the query from the request URL against Pubmed and return the
     resulting articles serialized as JSON."""
     query_params = urlparse.parse_qs(urlparse.urlparse(self.path).query)
     client = Pubmed("*****@*****.**")
     client.query(query_params["query"])
     client.fetch()
     client.parse()
     return json.dumps(convert_articles(client.articles))
Exemplo n.º 3
0
    def get_pubmed_words(self, pubmed_id):
        '''
        return a dict in the same format as get_field_words: k=field, v=sanitized list of words

        Tags listed in Pubmed.text_tags but absent on the populated record
        are simply skipped.
        '''
        record = Pubmed(pubmed_id).populate()
        # hasattr swallows AttributeError exactly like the original
        # try/except-pass around getattr did.
        return {tag: getattr(record, tag)
                for tag in Pubmed.text_tags
                if hasattr(record, tag)}
Exemplo n.º 4
0
    def insert(self, pmid, user):
        """Insert the reference for *pmid* on behalf of *user* and return
        its reference_no.

        Also inserts the related journal, abstract, author and ref_type
        rows. The reference row itself is reused if this pmid was already
        loaded, but the abstract/author/ref_type inserts below still run.
        """
        # Fetch the Medline record for this single pmid.
        medline = FetchMedline([pmid])
        records = medline.get_records()

        ## it is weird you can't do record = records[0]??
        # (records is presumably a generator, hence not indexable — TODO
        # confirm; note this loop keeps the LAST record if several come back)
        for rec in records:
            record = rec

        # get pubmed instance
        pubmed = Pubmed(record)

        # insert journal
        journal_no = Journal.insert(pubmed.journal_abbrev, user)

        # insert reference — reuse the existing row's number when this
        # pmid is already present, otherwise create and commit a new row
        ref_no = 0
        ref_query = self.query.filter_by(pubmed=pmid)
        if ref_query.first():
            ref_no = ref_query.first().reference_no
        else:
            ref_entry = self(user, pubmed.publish_status, pubmed.citation,
                             pubmed.year, pmid, 'PubMed script',
                             pubmed.pdf_status, pubmed.pages, pubmed.volume,
                             pubmed.title, pubmed.issue, journal_no)
            db.session.add(ref_entry)
            db.session.commit()
            ref_no = ref_entry.reference_no

        # insert abstract
        Abstract.insert(ref_no, pubmed.abstract_txt)

        # insert author — order is 1-based author position on the paper
        order = 0
        for name in pubmed.authors:
            order += 1
            author_no = Author.insert(name, user)
            AuthorEditor.insert(author_no, ref_no, order)

        # insert ref_type
        RefType.insert(pubmed.pub_type, ref_no, 'NCBI', user)

        return ref_no
Exemplo n.º 5
0
# First download nltk stuffs
home=os.environ["HOME"]
if not os.path.exists("%s/nltk_data" %home):
   import nltk
   nltk.download('all')

# Download neurosynth data
df = pandas.read_csv("database.txt",sep="\t")
pmids = df.id.unique().tolist()

print "NeuroSynth database has %s unique PMIDs" %(len(pmids))

# download abstract text
email = "*****@*****.**"
pm = Pubmed(email,pmc=False)
articles1 = pm.get_many_articles(pmids[:10000])
articles2 = pm.get_many_articles(pmids[10000:])
articles = articles1.copy()
articles.update(articles2)

if not os.path.exists("articles.pkl"):
    pickle.dump(articles,open("articles.pkl","wb"))

# Write articles to file
#88390|"<text><p>sentence1</p><p>sentence2</p><p></text>"
#88390|"<text><p>sentence1</p><p>sentence2</p><p></text>"
# We should use utf-8 http://www.postgresql.org/docs/9.0/static/multibyte.html

filey = open(output_file,"wb")
count = 0
Exemplo n.º 6
0
def main():
	"""Build and populate the biomart ent_experiment table.

	For each (study, platform) pair read from the 'experiment' port,
	gather publication info — from Pubmed when the study has a 'pubmed'
	annotation, otherwise from the study's own title/contact annotations —
	and batch-insert one row per experiment.
	"""
	task.check_conf(["entities", "repositories", "biomart.db"])
	conf = task.conf

	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	# Optional mapping from study-id prefix to source metadata (name,
	# home_url, link template) used to build study links below.
	if "biomart.study_source" in conf:
		study_source_map = conf["biomart.study_source"]
	else:
		study_source_map = conf.create_element()

	log = task.logger()

	exp_port = task.ports("experiment")

	es = EntityServer(conf["entities"])
	em = es.manager()

	conn = biomart_db_connect(conf["biomart.db"], log)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

	cursor = conn.cursor()

	cursor.execute("""
		CREATE TABLE ent_experiment (
		  id int(11) NOT NULL,
		  exp_name varchar(64) NOT NULL,
		  study_id varchar(32) NOT NULL,
		  study_source varchar(32) DEFAULT NULL,
		  study_source_url varchar(512) DEFAULT NULL,
		  study_link varchar(512) DEFAULT NULL,
		  pub_pubmed varchar(32) DEFAULT NULL,
		  pub_title varchar(300) DEFAULT NULL,
		  pub_authors varchar(300) DEFAULT NULL,
		  pub_year varchar(16) DEFAULT NULL,
		  pub_journal varchar(200) DEFAULT NULL,
		  platf_id varchar(32) NOT NULL,
		  platf_title varchar(250) DEFAULT NULL,
		  platf_technology varchar(96) DEFAULT NULL,
		  PRIMARY KEY (id),
		  KEY exp_name (exp_name),
		  KEY pub_pubmed (pub_pubmed),
		  KEY pub_title (pub_title),
		  KEY pub_authors (pub_authors),
		  KEY pub_year (pub_year),
		  KEY pub_journal (pub_journal),
		  KEY platf_title (platf_title),
		  KEY platf_technology (platf_technology)
		) ENGINE={} CHARACTER SET utf8 COLLATE utf8_general_ci""".format(db_engine))

	ib = BatchInsert(cursor, "ent_experiment",
			["id", "exp_name", "study_id", "study_source", "study_source_url", "study_link",
				"pub_title", "pub_authors", "pub_year", "pub_pubmed", "pub_journal",
				"platf_id", "platf_title", "platf_technology"], insert_size)

	pubmed = Pubmed()

	for i, exp in enumerate(exp_port, 1):
		study_id = exp[0]
		platform_id = exp[1]

		study = em.find(study_id, types.SOURCE_STUDY)
		if study is None:
			log.error("{} not found: {}".format(types.SOURCE_STUDY, study_id))
			continue

		platf = em.find(platform_id, types.SOURCE_PLATFORM)
		if platf is None:
			log.error("{} not found: {}".format(types.SOURCE_PLATFORM, platform_id))
			continue

		log.info("Experiment for study {} and platform {} ...".format(study_id, platform_id))

		# Default all publication fields to None; filled in below when
		# information is available.
		pub = {}
		for k in ["title", "short_authors", "date", "journal"]:
			pub[k] = None

		if "pubmed" in study:
			pmid = study["pubmed"]
			if isinstance(pmid, (DataElementList, list)):
				pmid = pmid[0]
				log.warn("Study {} with many pubmed_id's, only the first {} will be considered".format(study_id, pmid))

			log.debug("Retrieving information for pubmed_id '{}' ...".format(pmid))
			try:
				# Keep the None-filled defaults unless a record was found.
				# (The original assigned the raw result list to 'pub', so
				# an empty result left 'pub' as a list and crashed at
				# pub.items() below.)
				results = pubmed.find(pmid)
				if len(results) == 0:
					log.error("No publication information found for pubmed_id '{}' in experiment ({}, {})".format(pmid, study_id, platform_id))
				else:
					pub = results[0]
			except Exception as ex:
				log.error("Error retrieving pubmed information for experiment ({}, {}) with pubmed_id '{}'".format(study_id, platform_id, pmid))
				log.exception(ex)
		else:
			pmid = None
			log.warn("Study {} has no 'pubmed_id' annotation".format(study_id))

			if "title" not in study:
				log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'title'".format(study_id))
			elif "SO/contact_details[0]/contact_name" not in study \
					and "SO/contact_details/contact_name" not in study:
				log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'SO.contact_details[0].contact_name'".format(study_id))
			else:
				# Fall back to the study's own annotations for the
				# publication fields.
				try:
					pub["title"] = study["title"]

					if "SO/contact_details[0]/contact_name" in study:
						pub["short_authors"] = study["SO/contact_details[0]/contact_name"]
					else:
						pub["short_authors"] = study["SO/contact_details/contact_name"]

					if "SO/submission/pub_date" in study:
						pub["date"] = study["SO/submission/pub_date"]
					else:
						pub["date"] = ""
				except Exception as ex:
					log.debug(study)
					# was log.execption(ex) — typo raised AttributeError
					log.exception(ex)

		# Escape single quotes so the values survive the string-built
		# insert statements.
		for k, v in pub.items():
			if v is not None and isinstance(v, basestring):
				pub[k] = v.replace("'", r"\'")

		exp_name = "{}; {}".format(study_id, platform_id)

		study_source = None
		study_source_url = None
		study_link = None

		# Map the study-id prefix (e.g. "GEO-GSE123" -> "GEO") to its
		# source name/url and build a direct link when a template exists.
		parts = study_id.split("-")
		if len(parts) >= 2 and parts[0] in study_source_map:
			ss = study_source_map[parts[0]]
			study_source = ss.get("name")
			study_source_url = ss.get("home_url")
			try:
				study_link = ss.get("link", "").format(parts[1])
			except:
				pass

		ib.insert(i, exp_name, study_id, study_source, study_source_url, study_link,
			pub["title"], pub["short_authors"], pub["date"], pmid, pub["journal"],
			platform_id, platf["SO/platform_title"], "")

	log.debug("{} experiments inserted".format(ib.count))

	ib.close()
	cursor.close()
	conn.close()
	em.close()
	es.close()