예제 #1
0
def main():
	"""Export mRNA enrichment results into the ``exp_<infix>_trs`` biomart tables.

	Creates one ``exp_<infix>_trs`` table for every table infix found in
	``ID_TYPE_TO_TABLE_INFIX`` except ``"gene"``, then consumes
	``(id_type, eid)`` pairs from the ``id`` port. For each pair the
	``types.MRNA_ENRICHMENT`` entity is looked up, its tab-separated results
	file is read and one row per known feature name is batch-inserted.

	An entity is logged and skipped when it cannot be found, has no
	``results_file``, references an ICDO or experiment that is not in the
	database, or when its results file cannot be opened or lacks one of the
	``__COLUMN_NAMES`` columns.
	"""
	task.check_conf(["entities", "repositories", "biomart.db"])
	conf = task.conf

	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

	log = task.logger()

	id_port = task.ports("id")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	conn = biomart_db_connect(conf["biomart.db"], log)

	cursor = conn.cursor()

	table_infixs = set(ID_TYPE_TO_TABLE_INFIX.values())

	# Feature name -> id lookups, one map per table infix.
	feat_ids = {}

	for name in table_infixs:
		# No table and no lookup map are prepared here for the "gene" infix.
		if name == "gene":
			continue

		cursor.execute("""
			CREATE TABLE IF NOT EXISTS exp_{0}_trs (
			  {0}_id int(11) NOT NULL,
			  icdo_id int(11) NOT NULL,
			  exp_id int(11) NOT NULL,
			  upreg_total int(11) DEFAULT NULL,
			  upreg_observed double DEFAULT NULL,
			  upreg_expected double DEFAULT NULL,
			  upreg_stdev double DEFAULT NULL,
			  upreg_pvalue double DEFAULT NULL,
			  upreg_cpvalue double DEFAULT NULL,
			  downreg_total int(11) DEFAULT NULL,
			  downreg_observed double DEFAULT NULL,
			  downreg_expected double DEFAULT NULL,
			  downreg_stdev double DEFAULT NULL,
			  downreg_pvalue double DEFAULT NULL,
			  downreg_cpvalue double DEFAULT NULL,
			  PRIMARY KEY ({0}_id,icdo_id,exp_id),
			  KEY icdo (icdo_id,exp_id),
			  KEY exp (exp_id),
			  CONSTRAINT exp_{0}_trs_{0}_id FOREIGN KEY ({0}_id) REFERENCES ent_{0} ({0}_id),
			  CONSTRAINT exp_{0}_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
			  CONSTRAINT exp_{0}_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
			) ENGINE={1} DEFAULT CHARSET=latin1""".format(name, db_engine))

		feat_ids[name] = map_from_select(cursor, "SELECT {0}_id, {0}_name FROM ent_{0}".format(name))

	icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
	exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

	for id_type, eid in id_port:
		e = em.find(eid, types.MRNA_ENRICHMENT)
		if e is None:
			# The placeholders must all be automatic: mixing "{}" with "{1}"
			# makes str.format raise ValueError instead of logging.
			log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
			continue

		if "results_file" not in e:
			log.error("{} [{}] without results file.".format(types.MRNA_ENRICHMENT, eid))
			continue

		study_id = e["study_id"]
		platform_id = e["platform_id"]
		icdo_topography = e["icdo_topography"]
		icdo_morphology = e["icdo_morphology"]

		okey = (study_id, platform_id, icdo_topography, icdo_morphology, id_type)

		log.info("Exporting enrichment results ({}) [{}] ...".format(", ".join(okey), eid))

		# NOTE(review): an id_type that maps to "gene" has no entry in feat_ids
		# and would fail further down with KeyError -- confirm the port never
		# delivers one.
		table_infix = ID_TYPE_TO_TABLE_INFIX[id_type]

		icdo_key = (icdo_topography, icdo_morphology)
		if icdo_key not in icdo:
			log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
			continue
		icdo_id = icdo[icdo_key]

		exp_key = (study_id, platform_id)
		if exp_key not in exp:
			log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
			continue
		exp_id = exp[exp_key]

		ib = BatchInsert(
			cursor,
			"exp_{}_trs".format(table_infix),
			[
				"{}_id".format(table_infix), "icdo_id", "exp_id",
				"upreg_total", "upreg_observed", "upreg_expected", "upreg_stdev", "upreg_pvalue", "upreg_cpvalue",
				"downreg_total", "downreg_observed", "downreg_expected", "downreg_stdev", "downreg_pvalue", "downreg_cpvalue",
			],
			insert_size)

		results_repo, results_path = rs.from_url(e["results_file"])

		try:
			reader = results_repo.open_reader(results_path)
		except Exception as ex:
			log.exception(ex)
			ib.close()
			results_repo.close()
			continue

		# read header: column name -> position
		hdr = reader.readline().rstrip().split("\t")
		hdr_map = {col_name: i for i, col_name in enumerate(hdr)}

		try:
			col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
		except KeyError as ex:
			# Bound to "ex" so the entity held in "e" is not clobbered.
			log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
			reader.close()
			ib.close()
			results_repo.close()
			continue

		skipped_ids = set()

		fids = feat_ids[table_infix]

		# read data
		for line in reader:
			line = line.rstrip()
			if not line:
				# A blank line carries no columns; projecting it would raise IndexError.
				continue

			fields = line.split("\t")
			feat_name = fields[0]
			if feat_name not in fids:
				skipped_ids.add(feat_name)
				continue

			values = [fields[i] for i in col_indices]

			ib.insert(fids[feat_name], icdo_id, exp_id, *values)

		if skipped_ids:
			log.warn("There were {} feature names not found:\n{}".format(len(skipped_ids), ",".join(skipped_ids)))

		log.debug("{} results inserted".format(ib.count))

		ib.close()
		reader.close()
		# The failure paths above already release the repository; do the same
		# on success so it is not leaked once per processed entity.
		results_repo.close()

	cursor.close()

	em.close()
	es.close()
	rs.close()
예제 #2
0
def main():
    """Export mRNA oncodrive gene results into the ``exp_gene_trs`` biomart table.

    Creates ``exp_gene_trs`` if it does not exist, takes a write lock on it and
    consumes entity ids from the ``id`` port. For each id the
    ``types.MRNA_ONCODRIVE_GENES`` entity is looked up, its tab-separated
    results file is read and one row per known gene name is batch-inserted.

    An entity is logged and skipped when it cannot be found, has no
    ``results_file``, references an ICDO or experiment that is not in the
    database, or when its results file cannot be opened or lacks one of the
    ``__COLUMN_NAMES`` columns.

    Whenever the accumulated ``ib.count`` reaches 1,000,000 the table is
    unlocked, optimized and locked again; a final unlock and optimize run
    after the port is exhausted.
    """
    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    oncodrive_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)

    cursor = conn.cursor()

    gene = map_from_select(cursor, "SELECT id, gene_name FROM ent_gene")
    icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    cursor.execute(
        """
		CREATE TABLE IF NOT EXISTS exp_gene_trs (
		  gene_id int(11) NOT NULL,
		  icdo_id int(11) NOT NULL,
		  exp_id int(11) NOT NULL,
		  upreg_total int(11) DEFAULT NULL,
		  upreg_observed double DEFAULT NULL,
		  upreg_expected double DEFAULT NULL,
		  upreg_stdev double DEFAULT NULL,
		  upreg_pvalue double DEFAULT NULL,
		  upreg_cpvalue double DEFAULT NULL,
		  downreg_total int(11) DEFAULT NULL,
		  downreg_observed double DEFAULT NULL,
		  downreg_expected double DEFAULT NULL,
		  downreg_stdev double DEFAULT NULL,
		  downreg_pvalue double DEFAULT NULL,
		  downreg_cpvalue double DEFAULT NULL,
		  PRIMARY KEY (gene_id,icdo_id,exp_id),
		  KEY icdo (icdo_id,exp_id),
		  KEY exp (exp_id),
		  CONSTRAINT exp_gene_trs_gene_id FOREIGN KEY (gene_id) REFERENCES ent_gene (id),
		  CONSTRAINT exp_gene_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
		  CONSTRAINT exp_gene_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
		) ENGINE={} DEFAULT CHARSET=latin1""".format(
            db_engine
        )
    )

    cursor.execute("LOCK TABLES exp_gene_trs WRITE")

    # Running total of ib.count since the table was last (re)locked.
    lock_count = 0

    for eid in oncodrive_port:
        e = em.find(eid, types.MRNA_ONCODRIVE_GENES)
        if e is None:
            log.error("{} not found: {}".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        if "results_file" not in e:
            log.error("{} [{}] without results file.".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e["icdo_morphology"]

        okey = (study_id, platform_id, icdo_topography, icdo_morphology)

        log.info("Exporting oncodrive results ({}) [{}] ...".format(", ".join(okey), eid))

        icdo_key = (icdo_topography, icdo_morphology)
        if icdo_key not in icdo:
            log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
            continue
        icdo_id = icdo[icdo_key]

        exp_key = (study_id, platform_id)
        if exp_key not in exp:
            log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
            continue
        exp_id = exp[exp_key]

        ib = BatchInsert(
            cursor,
            "exp_gene_trs",
            [
                "gene_id",
                "icdo_id",
                "exp_id",
                "upreg_total",
                "upreg_observed",
                "upreg_expected",
                "upreg_stdev",
                "upreg_pvalue",
                "upreg_cpvalue",
                "downreg_total",
                "downreg_observed",
                "downreg_expected",
                "downreg_stdev",
                "downreg_pvalue",
                "downreg_cpvalue",
            ],
            insert_size,
        )

        results_repo, results_path = rs.from_url(e["results_file"])

        try:
            reader = results_repo.open_reader(results_path)
        except Exception as ex:
            log.exception(ex)
            ib.close()
            results_repo.close()
            continue

        # read header: column name -> position
        hdr = reader.readline().rstrip().split("\t")
        hdr_map = {col_name: i for i, col_name in enumerate(hdr)}

        try:
            col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
        except KeyError as ex:
            # Bound to "ex" so the entity held in "e" is not clobbered.
            log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
            reader.close()
            lock_count += ib.count
            ib.close()
            results_repo.close()
            continue

        skipped_genes = set()

        # read data
        for line in reader:
            line = line.rstrip()
            if not line:
                # A blank line carries no columns; projecting it would raise IndexError.
                continue

            fields = line.split("\t")
            gene_name = fields[0]
            if gene_name not in gene:
                skipped_genes.add(gene_name)
                continue

            values = [fields[i] for i in col_indices]

            ib.insert(gene[gene_name], icdo_id, exp_id, *values)

        if skipped_genes:
            log.warn("There were {} gene names not found:\n{}".format(len(skipped_genes), ",".join(skipped_genes)))

        log.debug("{} gene results inserted".format(ib.count))

        lock_count += ib.count

        ib.close()
        reader.close()
        # The failure paths above already release the repository; do the same
        # on success so it is not leaked once per processed entity.
        results_repo.close()

        # Periodically release the lock and optimize the table before relocking.
        if lock_count >= 1000000:
            cursor.execute("UNLOCK TABLES")
            cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
            cursor.execute("LOCK TABLES exp_gene_trs WRITE")
            lock_count = 0

    cursor.execute("UNLOCK TABLES")
    cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
    cursor.close()

    em.close()
    es.close()
    rs.close()