def simgic_hindex(id1, id2, h=4): cur = db.cursor() cur.execute( """SELECT ( SELECT SUM(y.rel_info) FROM ( SELECT DISTINCT f.term_id, f.rel_info FROM graph_path p1, graph_path p2, SSM_TermDesc f WHERE p1.term2_id = %s AND p2.term2_id = %s AND p1.term1_id=p2.term1_id AND p1.term1_id=f.term_id AND f.hindex >= %s) AS y ) /( SELECT SUM(x.rel_info) FROM ( SELECT f1.term_id, f1.rel_info FROM graph_path p1, SSM_TermDesc f1 WHERE p1.term2_id = %s AND p1.term1_id = f1.term_id AND f1.hindex >= %s UNION SELECT f2.term_id, f2.rel_info FROM graph_path p2, SSM_TermDesc f2 WHERE p2.term2_id = %s AND p2.term1_id = f2.term_id AND f2.hindex >= %s) AS x )""", (id1, id2, h, id1, h, id2, h)) res = cur.fetchone()[0] if res is None: res = '0' return float(res)
def simui(id1, id2): cur = db.cursor() cur.execute("""SELECT ( SELECT COUNT(y.rel_info) FROM ( SELECT DISTINCT f.term_id, f.rel_info FROM graph_path p1, graph_path p2, SSM_TermDesc f WHERE p1.term2_id = %s AND p2.term2_id = %s AND p1.term1_id=p2.term1_id AND p1.term1_id=f.term_id) AS y ) /( SELECT COUNT(x.rel_info) FROM ( SELECT f1.term_id, f1.rel_info FROM graph_path p1, SSM_TermDesc f1 WHERE p1.term2_id = %s AND p1.term1_id = f1.term_id UNION SELECT f2.term_id, f2.rel_info FROM graph_path p2, SSM_TermDesc f2 WHERE p2.term2_id = %s AND p2.term1_id = f2.term_id) AS x )""", (id1,id2,id1,id2)) res = cur.fetchone()[0] if res is None: res = '0' return float(res)
def simui(id1, id2): cur = db.cursor() cur.execute( """SELECT ( SELECT COUNT(y.rel_info) FROM ( SELECT DISTINCT f.term_id, f.rel_info FROM graph_path p1, graph_path p2, SSM_TermDesc f WHERE p1.term2_id = %s AND p2.term2_id = %s AND p1.term1_id=p2.term1_id AND p1.term1_id=f.term_id) AS y ) /( SELECT COUNT(x.rel_info) FROM ( SELECT f1.term_id, f1.rel_info FROM graph_path p1, SSM_TermDesc f1 WHERE p1.term2_id = %s AND p1.term1_id = f1.term_id UNION SELECT f2.term_id, f2.rel_info FROM graph_path p2, SSM_TermDesc f2 WHERE p2.term2_id = %s AND p2.term1_id = f2.term_id) AS x )""", (id1, id2, id1, id2)) res = cur.fetchone()[0] if res is None: res = '0' return float(res)
def harmonicmeanIC(id1, id2): if id1 == 0 or id2 == 0: return 0 cur = db.cursor() cur.execute("""SELECT f.rel_info FROM SSM_TermDesc f WHERE f.term_id = %s""", (id1,)) ic1 = float(cur.fetchone()[0]) cur = db.cursor() cur.execute("""SELECT f.rel_info FROM SSM_TermDesc f WHERE f.term_id = %s""", (id2,)) ic2 = float(cur.fetchone()[0]) if ic2 == 0 or ic2 == 0: return 0 return (2*ic1*ic2)/(ic1+ic2)
def simgic_hindex(id1, id2, h=4): cur = db.cursor() cur.execute("""SELECT ( SELECT SUM(y.rel_info) FROM ( SELECT DISTINCT f.term_id, f.rel_info FROM graph_path p1, graph_path p2, SSM_TermDesc f WHERE p1.term2_id = %s AND p2.term2_id = %s AND p1.term1_id=p2.term1_id AND p1.term1_id=f.term_id AND f.hindex >= %s) AS y ) /( SELECT SUM(x.rel_info) FROM ( SELECT f1.term_id, f1.rel_info FROM graph_path p1, SSM_TermDesc f1 WHERE p1.term2_id = %s AND p1.term1_id = f1.term_id AND f1.hindex >= %s UNION SELECT f2.term_id, f2.rel_info FROM graph_path p2, SSM_TermDesc f2 WHERE p2.term2_id = %s AND p2.term1_id = f2.term_id AND f2.hindex >= %s) AS x )""", (id1,id2,h,id1,h,id2,h)) res = cur.fetchone()[0] if res is None: res = '0' return float(res)
def get_IC(): cur = db.cursor() # check for exact match query = """SELECT distinct term_id, rel_info, hindex_info, seco_info FROM SSM_TermDesc""" # print "QUERY", query cur.execute(query) res = cur.fetchall() return zip(*res)
def check_dist_between(cid1, cid2): cur = db.cursor() query = """SELECT distance FROM graph_path WHERE term1_id = %s and term2_id = %s""" % (cid1, cid2) cur.execute(query) res = cur.fetchone() if res is None: dist = -1 else: dist = int(res[0]) return dist
def get_description(id): cur = db.cursor() query = """SELECT term_definition FROM term_definition WHERE term_id = %s""" % id cur.execute(query) res = cur.fetchone() if res is not None: return res[0] else: return "NA"
def termsInCommon(id1, id2): cur = db.cursor() cur.execute("""SELECT COUNT(i.rel_info) FROM graph_path p1, graph_path p2, SSM_TermDesc i WHERE p1.term2_id = %s AND p2.term2_id = %s AND p1.term1_id = p2.term1_id AND p1.term1_id = i.term_id;""", (id1, id2)) res = cur.fetchone()[0] if res is None: res = '0' return int(res)
def resnik(id1, id2): cur = db.cursor() cur.execute("""SELECT MAX(i.rel_info) FROM graph_path p1, graph_path p2, SSM_TermDesc i WHERE p1.term2_id = %s AND p2.term2_id = %s AND p1.term1_id = p2.term1_id AND p1.term1_id = i.term_id;""", (id1, id2)) #r = cur.store_result() res = cur.fetchone()[0] #print id1, id2, res if res is None: res = '0' return float(res)
def load_synonyms(): syns = [] cur = db.cursor() query = """SELECT id, name FROM term """ cur.execute(query) ids = cur.fetchall() for i in ids: print "getting synonyms for" + i[1].lower() + '(' + str(i[0]) + ')', synset = set() synset.add(i[1].lower()) query = """SELECT term_synonym FROM term_synonym WHERE term_id = %s""" % i[0] cur.execute(query) names = cur.fetchall() print len(names) for name in names: #print name[0], synset.add(name[0].lower()) syns.append(synset) pickle.dump(syns, open("data/chebi_synonyms.pickle", 'wb')) print "done"
def find_chebi_term(term, adjust=0): ''' returns tuple (chebiID, chebiTerm, score) if resolution fails, return ('0', 'null', 0.0) ''' # print "TERM", term term = MySQLdb.escape_string(term) # adjust - adjust the final score match = () cur = db.cursor() # check for exact match query = """SELECT distinct id, name FROM term a WHERE name =%s and LENGTH(a.name)>0 and star=3;""" # print "QUERY", query cur.execute(query, (term,)) res = cur.fetchone() if res is not None: # print "1" score = 1.0 + adjust match = (str(res[0]), res[1], score) else: # synonyms cur.execute("""SELECT a.term_id, a.term_synonym, b.name FROM term_synonym a, term b WHERE a.term_synonym=%s and b.id=a.term_id and LENGTH(a.term_synonym)>0 and star=3;""", (term,)) res = cur.fetchone() if res is not None: # print "2" score = 0.8 + adjust match = (str(res[0]), res[2], score) else: # plural - tb pode ser recursivo if len(term) > 0 and term[-1] == 's': match = find_chebi_term(term[:-1], -0.1) # if not match: #(1)H -> hydrogen-1 terms = re.sub(r'[\(|\)|\[|\]| ]', ' ', term) termlist = terms.strip().split(" ") if len(termlist) == 2: if termlist[0].isdigit() and termlist[1] in element_base: match = find_chebi_term(element_base[termlist[1]][0] + "-" + termlist[0], -0.1) if termlist[1] == '+' and termlist[0] in element_base: match = find_chebi_term(element_base[termlist[0]][0] + ' cation', -0.1) if termlist[1] == '-' and termlist[0] in element_base: match = find_chebi_term(element_base[termlist[0]][0] + ' anion', -0.1) # if match != '': # print term, match if not match: # partial match terms = '("' + '","'.join(term.split(" ")) + '")' query = """ SELECT ((sum(d.ic)/ec)-0.1) as score, e.name, c.term_id, c.id, group_concat(d.word separator ','), count(d.id), c.descriptor_type FROM term e JOIN descriptor3 c ON(c.term_id=e.id) JOIN word2term3 b ON (b.descriptor_id=c.id) JOIN word3 d ON (d.id=b.word_id) JOIN SSM_TermDesc f ON (e.id=f.term_id) WHERE b.word_id IN (SELECT distinct id FROM word3 WHERE word in %s) GROUP by c.id ORDER by score desc LIMIT 3;""" % (terms,) # print "QUERY3", query, adjust cur.execute(query) res = cur.fetchone() if res is not None: # print "3" match = (str(res[3]), res[1], float(res[0])) # print term, match if not match or match[2] < 0.0: match = ('0', 'null', 0.0) return match
def find_chebi_term(term, adjust=0): ''' returns tuple (chebiID, chebiTerm, score) if resolution fails, return ('0', 'null', 0.0) ''' # print "TERM", term term = MySQLdb.escape_string(term) # adjust - adjust the final score match = () cur = db.cursor() # check for exact match query = """SELECT distinct id, name FROM term a WHERE name =%s and LENGTH(a.name)>0 and star=3;""" # print "QUERY", query cur.execute(query, (term, )) res = cur.fetchone() if res is not None: # print "1" score = 1.0 + adjust match = (str(res[0]), res[1], score) else: # synonyms cur.execute( """SELECT a.term_id, a.term_synonym, b.name FROM term_synonym a, term b WHERE a.term_synonym=%s and b.id=a.term_id and LENGTH(a.term_synonym)>0 and star=3;""", (term, )) res = cur.fetchone() if res is not None: # print "2" score = 0.8 + adjust match = (str(res[0]), res[2], score) else: # plural - tb pode ser recursivo if len(term) > 0 and term[-1] == 's': match = find_chebi_term(term[:-1], -0.1) # if not match: #(1)H -> hydrogen-1 terms = re.sub(r'[\(|\)|\[|\]| ]', ' ', term) termlist = terms.strip().split(" ") if len(termlist) == 2: if termlist[0].isdigit() and termlist[1] in element_base: match = find_chebi_term( element_base[termlist[1]][0] + "-" + termlist[0], -0.1) if termlist[1] == '+' and termlist[0] in element_base: match = find_chebi_term( element_base[termlist[0]][0] + ' cation', -0.1) if termlist[1] == '-' and termlist[0] in element_base: match = find_chebi_term( element_base[termlist[0]][0] + ' anion', -0.1) # if match != '': # print term, match if not match: # partial match terms = '("' + '","'.join(term.split(" ")) + '")' query = """ SELECT ((sum(d.ic)/ec)-0.1) as score, e.name, c.term_id, c.id, group_concat(d.word separator ','), count(d.id), c.descriptor_type FROM term e JOIN descriptor3 c ON(c.term_id=e.id) JOIN word2term3 b ON (b.descriptor_id=c.id) JOIN word3 d ON (d.id=b.word_id) JOIN SSM_TermDesc f ON (e.id=f.term_id) WHERE b.word_id IN (SELECT distinct id FROM word3 WHERE word in %s) GROUP by c.id ORDER by score desc LIMIT 3;""" % (terms, ) # print "QUERY3", query, adjust cur.execute(query) res = cur.fetchone() if res is not None: # print "3" match = (str(res[3]), res[1], float(res[0])) # print term, match if not match or match[2] < 0.0: match = ('0', 'null', 0.0) return match