def get_all_manual_exptl_AE(con, tstat=0):
    """Fetch every public manual annotation that carries an annotation_extension.

    con -- open DB-API connection to the GO database.
    tstat -- when truthy, wrap the query so only the first 100 rows come back
             (handy for quick test runs).

    Returns a tab object with one row per unrolled annotation extension.
    """
    wrapper_open = ''
    wrapper_close = ''
    if tstat:
        # Oracle-style row limit: wrap the whole select and filter on ROWNUM.
        wrapper_open = "SELECT * FROM ("
        wrapper_close = ") WHERE ROWNUM <= 100"
    sql = (wrapper_open +
           " SELECT a.ENTITY_ID as GP_ID, a.GO_ID, t.name as GO_NAME, e2g.code as EVIDENCE, "
           "a.REF_DB_CODE AS REF_TYPE, a.REF_DB_ID AS REF_ACC, a.ANNOTATION_EXTENSION, a.SOURCE "
           "from go.v_manual_annotations a "
           "join go.terms t on (t.go_id = a.go_id) "
           "join GO.EVIDENCE2ECO e2g ON (e2g.eco_id = a.eco_id) "
           "where a.is_public = 'Y' "
           "and a.annotation_extension is not null " + wrapper_close)
    cursor = con.cursor()
    cursor.execute(sql)
    # Headers listed explicitly so column order is stable when printing.
    results_tab = tab(headers=['SOURCE', 'GP_ID', 'GO_ID', 'GO_NAME',
                               'ANNOTATION_EXTENSION', 'EVIDENCE',
                               'REF_TYPE', 'REF_ACC'])
    for row in dict_cursor(cursor):
        # Rows carrying an extension are expanded to one row per extension.
        if row['ANNOTATION_EXTENSION']:
            results_tab.tab.extend(unroll_AE(row))
        else:
            results_tab.tab.append(row)
    results_tab.validate()
    return results_tab
def get_all_manual_exptl_AE(con, tstat=0):
    """Finds all manual experimental annotations with an annotation_extension,
    Returns a tab object of them, with all AEs unrolled to -> one per line

    con -- open DB-API connection to the GO database.
    tstat -- truthy => limit output to 100 rows (test runs).
    """
    pre = ''
    post = ''
    if tstat:
        # Oracle-style row limit: wrap the main select and filter on ROWNUM.
        pre = "SELECT * FROM ("
        post = ") WHERE ROWNUM <= 100"
    cursor = con.cursor()
    query = "%s SELECT a.ENTITY_ID as GP_ID, a.GO_ID, t.name as GO_NAME, e2g.code as EVIDENCE, " \
            "a.REF_DB_CODE AS REF_TYPE, a.REF_DB_ID AS REF_ACC, a.ANNOTATION_EXTENSION, a.SOURCE " \
            "from go.v_manual_annotations a " \
            "join go.terms t on (t.go_id = a.go_id) " \
            "join GO.EVIDENCE2ECO e2g ON (e2g.eco_id = a.eco_id) " \
            "where a.is_public = 'Y' " \
            "and a.annotation_extension is not null %s" % (pre, post)
    cursor.execute(query)
    # Specify headers in order to set order of columns for printing.
    results_tab = tab(headers=[
        'SOURCE', 'GP_ID', 'GO_ID', 'GO_NAME', 'ANNOTATION_EXTENSION',
        'EVIDENCE', 'REF_TYPE', 'REF_ACC'
    ])
    for r in dict_cursor(cursor):
        # Rows with an extension become one output row per unrolled extension.
        if r['ANNOTATION_EXTENSION']:
            results_tab.tab.extend(unroll_AE(r))
        else:
            results_tab.tab.append(r)
    results_tab.validate()
    return results_tab
def __main__():
    """Load the ontology named on the command line, run every query in
    ./queries.tsv against it, then shut the reasoner down."""
    reasoner = Brain()
    reasoner.learn(sys.argv[1])  # argv[1]: path/URL of the ontology to load
    query_tab = tab("./", "queries.tsv")
    for row in query_tab.tab:
        Query(row, reasoner).qtest()
    reasoner.sleep()  # release the reasoner's resources
def __init__(self, manual_map, owl_map, RCV, go):
    """manual_map is a list of dicts containing the manual mapping table;
    owl_map is a dict(row) of dicts(columns) containing the OWL mapping table,
    RCV is the Roche term table as a dict of dicts
    go is a Brain object containing the ontology used for mapping."""
    self.manual_map = manual_map
    self.owl_map = owl_map
    self.go = go
    self.rcv = RCV
    self.obs_status = {} # A dictionary of manually mapped GO terms, with value = is obsolete True/False
    self.update_manual_map_obs_stat()  # populate obs_status from manual_map
    # Accumulator for combined auto + manual mapping results; headers fix
    # the column order used when the tab is printed.
    self.combined_results = tab()
    self.combined_results.headers = ["RCV_ID", "RCV_NAME", "GO_ID", "GO_NAME", "STATUS"]
def plot_count(column, stats, path):
    """Tabulate the frequency of values in `column` of the `stats` tab and
    write the result to <column>_plot.tsv (spaces replaced by underscores).

    column -- name of the column in `stats` to count.
    stats  -- tab object supporting extract_column().
    path   -- unused here; kept for interface compatibility.

    Returns the plot tab object.
    """
    nc = 'Number ' + column
    plot = tab(headers=['Number RCV', nc])
    counts = Counter(stats.extract_column(column))
    for value, n in counts.items():  # items() works on both Py2 and Py3
        plot.tab.append({'Number RCV': value, nc: n})
    # BUG FIX: original said `plot.validate` (a bare attribute access, so a
    # no-op); the validation method must actually be called.
    plot.validate()
    out_name = re.sub(' ', '_', column) + "_plot.tsv"
    # with-block guarantees the file handle is closed even on error.
    with open(out_name, "w") as out_plot:
        out_plot.write(plot.print_tab(sort_keys=[nc]))
    return plot
def __init__(self, manual_map, owl_map, RCV, go):
    """Set up a mapping-comparison object.

    manual_map -- list of dicts: the manual mapping table.
    owl_map    -- dict of dicts (row -> columns): the OWL mapping table.
    RCV        -- Roche term table, as a dict of dicts.
    go         -- Brain object holding the ontology used for mapping.
    """
    self.go = go
    self.rcv = RCV
    self.manual_map = manual_map
    self.owl_map = owl_map
    # Obsolescence flags for manually mapped GO terms: term -> True/False.
    self.obs_status = {}
    self.update_manual_map_obs_stat()
    # Accumulator for combined auto + manual mapping results.
    self.combined_results = tab()
    self.combined_results.headers = ["RCV_ID", "RCV_NAME",
                                     "GO_ID", "GO_NAME", "STATUS"]
def plot_count(column, stats, path):
    """Count how often each value appears in `column` of `stats` and dump the
    counts to <column>_plot.tsv (spaces in the column name become underscores).

    column -- column of `stats` to tally.
    stats  -- tab object providing extract_column().
    path   -- unused; retained for interface compatibility.

    Returns the plot tab.
    """
    nc = 'Number ' + column
    plot = tab(headers=['Number RCV', nc])
    clist = stats.extract_column(column)
    c = Counter(clist)
    for l, n in c.items():  # items() is Py2/Py3 compatible
        d = {}
        d['Number RCV'] = l
        d[nc] = n
        plot.tab.append(d)
    # BUG FIX: `plot.validate` without parentheses was a no-op attribute
    # access; the method is now actually invoked.
    plot.validate()
    # with-block ensures the output file is closed even if the write fails.
    with open(re.sub(' ', '_', column) + "_plot.tsv", "w") as out_plot:
        out_plot.write(plot.print_tab(sort_keys=[nc]))
    return plot
def gen_report_tab(od):
    """Build a tab report of annotation-type/text -> ontology class mappings.

    od -- object exposing gen_annotation_report(), which yields dicts with keys
          annotation_class, annotation_text, op_label, op_id, class_label,
          class_id.

    Returns a tab whose rows use the report's own column naming.
    """
    headers = ['a.annotation_type', 'a.text', 'op_label', 'op_id',
               'class_label', 'class_id']  # Doesn't seem right to set this here...
    report = tab(headers=headers)
    for d in od.gen_annotation_report():
        row = {
            'a.annotation_type': d['annotation_class'],
            'a.text': d['annotation_text'],
            'op_label': d['op_label'],
            # op_id may be NULL in the DB; print as empty string, not "None".
            # FIX: compare to None with `is`, not `==`.
            'op_id': '' if d['op_id'] is None else d['op_id'],
            'class_label': d['class_label'],
            'class_id': d['class_id'],
        }
        report.tab.append(row)
    return report
def get_compact_AE(con):
    """Finds distinct set of GO + AE from manual experimental annotations
    with an annotation_extension. Returns a tab object of them, with all AEs
    unrolled to -> one per line.

    con -- open DB-API connection to the GO database.
    """
    cursor = con.cursor()
    # BUG FIX: the original literal concatenation omitted spaces, yielding
    # ")JOIN go.eco_terms" and "'Y'and" in the SQL text sent to the server.
    cursor.execute("SELECT DISTINCT a.GO_ID, t.name as GO_NAME, a.ANNOTATION_EXTENSION "
                   "FROM go.v_manual_annotations a "
                   "JOIN go.terms t ON (t.go_id = a.go_id) "
                   "JOIN go.eco_terms et ON (a.ECO_ID = et.ECO_ID) "
                   "WHERE a.is_public = 'Y' "
                   "and a.annotation_extension is not null ")
    # Headers given explicitly to fix the column order when printing.
    results_tab = tab(headers=['GO_ID', 'GO_NAME', 'ANNOTATION_EXTENSION'])
    for r in dict_cursor(cursor):
        # Expand multi-part extensions: one output row per extension.
        if r['ANNOTATION_EXTENSION']:
            results_tab.tab.extend(unroll_AE(r))
        else:
            results_tab.tab.append(r)
    results_tab.validate()
    return results_tab
def get_compact_AE(con):
    """Finds distinct set of GO + AE from manual experimental annotations with an annotation_extension,
    Returns a tab object of them, with all AEs unrolled to -> one per line

    con -- open DB-API connection to the GO database.
    """
    cursor = con.cursor()
    # NOTE(review): the concatenated SQL lacks a space before "JOIN go.eco_terms"
    # and after 'Y' (giving ")JOIN" and "'Y'and") -- the target DB appears to
    # tolerate this, but confirm before reuse.
    cursor.execute("SELECT DISTINCT a.GO_ID, t.name as GO_NAME, a.ANNOTATION_EXTENSION " \
                   "FROM go.v_manual_annotations a " \
                   "JOIN go.terms t ON (t.go_id = a.go_id)" \
                   "JOIN go.eco_terms et ON (a.ECO_ID = et.ECO_ID) " \
                   "WHERE a.is_public = 'Y'" \
                   "and a.annotation_extension is not null ")
    # Headers set explicitly to control printed column order.
    results_tab = tab(headers = ['GO_ID', 'GO_NAME', 'ANNOTATION_EXTENSION'])
    for r in dict_cursor(cursor):
        # Unroll rows carrying extensions to one row per extension.
        if r['ANNOTATION_EXTENSION']:
            results_tab.tab.extend(unroll_AE(r))
        else:
            results_tab.tab.append(r)
    results_tab.validate()
    return results_tab
""" sys.path.append("../mod/") con = get_con(usr = sys.argv[1] , pwd = sys.argv[2]) # connection to LMB DB. Need to make ssh tunnel first. ontologies = Brain() # Construct Brain object # Now load up ontologies. These are used to check validity for addition of new classes or # relations to DB. You can load as many as you need. ontologies.learn("http://purl.obolibrary.org/obo/fbbt/fbbt-simple.owl") # Switch to specific release if necessary. odbo = owlDbOnt(conn = con, ont = ontologies) # Object for interacting with the database + ontologies. # Also detects anything that looks like a FlyBase feature and checks validity against public FlyBase. annotation_table = tab(path = sys.argv[3], file_name=sys.argv[4]) # tsv file with headers: ind_name, class, rel, ind_source # ind_source must already be in the DB ID_range_start = 20000 for row in annotation_table.tab: print str(row) new_ind = odbo.add_ind(name = row['ind_name'], source = row['ind_source'], ID_range_start = 20000) # Returns FALSE and warns if addn fails if not new_ind: new_ind = odbo.ind_NameId[row['ind_name']] warnings.warn("Assuming existing individual called %s (%s) is the correct one, and adding types accordingly." % (new_ind, row['ind_name'])) print new_ind odbo.add_ind_type(ind = new_ind, OWLclass = row['class'], objectProperty = row['rel']) con.close()
# Rather scrappy, Perlish procedural code for generating mappings. Annoyingly monolithic: Have to run all mappings or none. """Reads owl_map and uses it to automatically populate RCV classes. Compares these to manual mappings. Prints a results summary and results tables. Ontology to use must be specified as argv[1] when runnning this script.""" from mapping_tools import (map_obj, load_ont, mappingTabs) from tsv2pdm import tab, rcd go = load_ont(sys.argv[1]) manMap = tab('../mapping_tables/', 'manual_mapping.tsv') # No key row. Stored as list of dicts. owlMap = rcd('../mapping_tables/', 'owl_map.tsv', 'RCV_ID') # dict of dicts. RCV = rcd('../mapping_tables/', 'RocheCV_def.tsv', 'RCV_ID') # dict of dicts. mapping_tabs = mappingTabs(manMap.tab, owlMap.rowColDict, RCV.rowColDict, go) # ...Hmmm - would give much more flexibility if passed objects rather than data structures. manMap_updated = open('../mapping_tables/manual_mapping.tsv', "w") manMap_updated.write(manMap.print_tab(sort_keys=('RCV_ID',))) manMap_updated.close() RCV_id_name = {} # Residual perlishness ? for row in manMap.tab: RCV_id_name[row['RCV_ID']]=row['RCV_NAME'] report_path = '../mapping_tables/results/'
from tsv2pdm import tab
from json_tree_tools import blank_treeContent_node, write_json, add_leaf, load_json, roll_readable_tree, get_nodeId_name
import operator

# Assume existing, valid domain tree files
# Add two new nodes - one for tracts and one for neuropils? - But do we have a terms?
# Definitely not for neuropils...
# Or could try interleaving node.
# TO do this, need a list of subnodes
#
# Or just bung them on the end...

# Load the existing domain tree structure/content JSON and the tract table.
dts = load_json("../BrainName_domains/json/treeStructure.jso")
dtc = load_json("../BrainName_domains/json/treeContent.jso")
ttc_tab = tab("../BrainName_tracts/", "domain_data.tsv")

# Collect the first external ID of every content node that has one.
dlist = []
for d in dtc:
    if 'extId' in d.keys():
        if len(d['extId']) > 0:
            dlist.append(d['extId'][0])

# sort table on name field (ultimately) => alphabetic ordering of tree
ttc_tab.tab.sort(key=operator.itemgetter('name'))

# Next free nodeId = max existing nodeId + 1.
D_nodeIds = []
for n in dtc:
    D_nodeIds.append(int(n['nodeId']))
i = max(D_nodeIds) + 1
from tsv2pdm import tab
from json_tree_tools import blank_treeContent_node, write_json, add_leaf, init_treeStructure
import operator
from __builtin__ import str

# Aim = single root: adult brain. With an alphanumeric list of tracts underneath

tc_tab = tab("./", "domain_data.tsv")  # tract domain table (tsv with headers)
# sort table on name field (ultimately) => alphabetic ordering of tree
tc_tab.tab.sort(key=operator.itemgetter('name'))

tc = []  # treeContent: list of node dicts
i = 1  # next free nodeId; '0' is reserved for the root below
ts = init_treeStructure()
# Root node: the adult brain (FBbt:00003624); all tracts hang off it.
adult_brain_node = blank_treeContent_node(nodeId='0', name = 'adult brain', oboID = 'FBbt:00003624')
tc.append(adult_brain_node)
for r in tc_tab.tab:
    # Only rows with an OBO id become tree nodes.
    if r['oboId']:
        n = blank_treeContent_node(domainId=r['domainID'], nodeId=str(i), name = r['name'], oboID = r['oboId'], color = r['domainColour'], centre = r['domainCentre'])
        tc.append(n)
        add_leaf(str(i), ts, '0')  # attach as a leaf directly under the root
        i += 1

write_json(json_var = tc, path = "json/treeContent.jso")
write_json(json_var = ts, path = "json/treeStructure.jso")
deleted = ct.tab1_only() # Only in the update tab new = ct.tab2_only() for r in new.tab: warnings.warn("Processes %s" % r) if r['class_id']: od.add_akv_type(key = r['a.annotation_type'], value =r['a.text'] , OWLclass = r['class_id'], objectProperty =r['op_id'] ) else: for r in deleted.tab: if not safe_mode: od.remove_akv_type(key = r['a.annotation_type'], value =r['a.text'] , OWLclass = r['class_id'], objectProperty =r['op_id'] ) else: warnings.warn("Row present in DB, now missing from mapping: %s. %s. " \ "Safe mode set so not deleting" % (r['a.annotation_type'], r['a.text'])) c = get_con(sys.argv[1], sys.argv[2]) b = Brain() b.learn(sys.argv[3]) # Path to ontology file with referenced terms (presumably fbbt_simple will suffice) od = owlDbOnt(conn = c, ont = b) update_table = tab("../../../doc/", "annotation_map.tsv") update_akv_from_tab(od, update_table) # Assumes update table has all mappings. If it lacks any, assumes these mappings are to be deleted! This is potentially dangerous if mapping table is out of sync with DB. outfile = open("../../../doc/annotation_map_report.tsv", "w") report_tab = gen_report_tab(od) outfile.write(report_tab.print_tab(sort_keys = ('a.annotation_type', 'a.text'))) outfile.close() c.commit() c.close() b.sleep()
import glob

from tsv2pdm import tab

# Append an "is_obsolete" column (default 0) to every RCV results file and to
# the results template, rewriting each file in place.
results_files = glob.glob("*RCV_*.tsv")
results_files.append("results_template.tsv")
for file_name in results_files:
    table = tab("", file_name)
    table.append_column("is_obsolete", 0)
    with open(file_name, "w") as out:
        out.write(table.print_tab())
from tsv2pdm import tab, rcd
from glob import glob1
import re
from numpy import average, median, sum, round
from collections import Counter

# General comment - this would be so much easier to do with a DB!
# TODO - hook, directly or indirectly, into the ticket system. Could pull from owl_map.

# One results file per RCV term lives under results/.
results = glob1("../mapping_tables/results/", "*_RCV_*.tsv")
# Per-RCV summary statistics accumulate in this tab, keyed on RCV_ID.
stats = tab(key_column="RCV_ID", headers=[
    'RCV_ID', 'RCV_name', 'Auto sufficient', 'Manual only', 'Auto only',
    'Manual blacklist', 'Auto blacklist', 'pattern'
])

# Should really load as rcd to enforce key column uniqueness
owl_map = rcd(path="../mapping_tables/", file_name="owl_map.tsv", key_column='RCV_ID')

total_sufficient_maps = 0  # running count of RCVs where auto mapping suffices

# Lists for doing basic statistical analysis of results
# Sure this could be done more elegantly with list comps on tab, but still...
Auto_sufficient = []
Manual_only = []
Auto_only = []
# Number of classes where auto mapping is sufficient from tsv2pdm import tab, rcd from glob import glob1 import re from numpy import average, median, sum, round from collections import Counter # General comment - this would be so much easier to do with a DB! # TODO - hook, directly or indirectly, into the ticket system. Could pull from owl_map. results = glob1("../mapping_tables/results/", "*_RCV_*.tsv") stats = tab(key_column = "RCV_ID", headers = ['RCV_ID', 'RCV_name', 'Auto sufficient', 'Manual only', 'Auto only', 'Manual blacklist', 'Auto blacklist', 'pattern']) # Should really load as rcd to enforce key column uniqueness owl_map = rcd(path = "../mapping_tables/", file_name = "owl_map.tsv", key_column = 'RCV_ID' ) total_sufficient_maps = 0 # Lists for doing basic statistical analysis of results # Sure this could be done more elegantly with list comps on tab, but still... Auto_sufficient = [] Manual_only = [] Auto_only = [] Auto_blacklist = [] Manual_blacklist = []
# Need to work on balance between the generating script and the module # TODO: Add code to generate full mapping table. This can be derived from results tables + ticket info without a further reasoner run. ## Spec: Include combined manual & auto mappings that are not blacklists from results files for while a ticket exists with label: mapping_complete. # Rather scrappy, Perlish procedural code for generating mappings. Annoyingly monolithic: Have to run all mappings or none. """Reads owl_map and uses it to automatically populate RCV classes. Compares these to manual mappings. Prints a results summary and results tables. Ontology to use must be specified as argv[1] when runnning this script.""" from mapping_tools import (map_obj, load_ont, mappingTabs) from tsv2pdm import tab, rcd go = load_ont(sys.argv[1]) manMap = tab('../mapping_tables/', 'manual_mapping.tsv') # No key row. Stored as list of dicts. owlMap = rcd('../mapping_tables/', 'owl_map.tsv', 'RCV_ID') # dict of dicts. RCV = rcd('../mapping_tables/', 'RocheCV_def.tsv', 'RCV_ID') # dict of dicts. mapping_tabs = mappingTabs(manMap.tab, owlMap.rowColDict, RCV.rowColDict, go) # ...Hmmm - would give much more flexibility if passed objects rather than data structures. manMap_updated = open('../mapping_tables/manual_mapping.tsv', "w") manMap_updated.write(manMap.print_tab(sort_keys=('RCV_ID', ))) manMap_updated.close() RCV_id_name = {} # Residual perlishness ? for row in manMap.tab: RCV_id_name[row['RCV_ID']] = row['RCV_NAME'] report_path = '../mapping_tables/results/'