def gen_simi_search_sql(self, smiles, min_simi):
    '''
    gen_simi_search_sql(smiles, min_simi) -- Returns a string.

    Builds the partial SQL used for a Tanimoto similarity search.

    args:
        smiles: str, smiles that represents the query molecule.
        min_simi: float (or anything float() accepts), the minimal
            similarity value (tanimoto coefficient).

    tanimoto coefficient:
        A is the number of ones in the fingerprint of the query molecule
          (read from fps[env['FP_BITS_KEY']])
        B is the number of ones in the fingerprint of the stored molecule
        C is the number of ones in A & B
        similarity = C / (A + B - C)

    returns a partial sql string like
        ', tanimoto as similarity FROM ... WHERE ( ... )'
    where 'tanimoto' is the sql expression computing the coefficient and
    'similarity' is env['SIMI_FIELD'].

    raises:
        KeyError if the fingerprint dict filled by mymol.get_fps() lacks
        env['FP_BITS_KEY'].
    '''
    import math

    min_simi = float(min_simi)
    fp_bits_key = self.env['FP_BITS_KEY']

    stat = {}
    fps = {}
    mol = mymol('smi', smiles)
    mol.get_mol_stat(stat)
    mol.get_fps(fps)

    record_keys = ("NUM_C", "NUM_O", "NUM_N", "NUM_P", "NUM_S", "NUM_F",
                   "NUM_Cl", "NUM_Br", "NUM_I")

    # Element-count pre-filter: each recorded count must lie within
    # [min_simi * n, n / min_simi].  It is skipped for min_simi <= 0, where
    # the upper bound is undefined (division by zero) or meaningless.
    stat_terms = []
    if min_simi > 0:
        for key in record_keys:
            if key in stat and key in self.recorded_fields_dict:
                field = self.recorded_fields_dict[key]
                # math.ceil so that a zero count gives a lower bound of 0;
                # the old "cond and a or b" idiom produced 1 for that case.
                lower = int(math.ceil(min_simi * stat[key]))
                upper = int(1 / min_simi * stat[key])
                stat_terms.append('%s >= %s AND %s <= %s'
                                  % (field, lower, field, upper))

    ones_a = fps[fp_bits_key]

    ones_b_terms = []   # BIT_COUNT of every recorded fingerprint column
    common_terms = []   # BIT_COUNT of column & query bits (non-zero only)
    for key in sorted(k for k in fps if k != fp_bits_key):
        if key in self.recorded_fields_dict:
            field = self.recorded_fields_dict[key]
            ones_b_terms.append('BIT_COUNT(%s)' % (field,))
            if fps[key] != 0:
                common_terms.append('BIT_COUNT(%s & %s)' % (field, fps[key]))

    # Fall back to the literal 0 so the expression stays valid SQL when no
    # fingerprint column contributes.
    if ones_b_terms:
        ones_b = '( ' + ' + '.join(ones_b_terms) + ' )'
    else:
        ones_b = '0'
    if common_terms:
        common = '( ' + ' + '.join(common_terms) + ' )'
    else:
        common = '0'

    tanimoto = '%s / ( %s + %s - %s )' % (common, str(ones_a), ones_b, common)
    join_part_sql = self.gen_join_part_sql(self.dbd.tables,
                                           self.env['PRI_FIELD'])

    where = '(%s > %s)' % (tanimoto, str(min_simi))
    if stat_terms:
        # Only emit the pre-filter when it is non-empty; "()" is invalid SQL.
        where = '(%s) AND %s' % (' AND '.join(stat_terms), where)

    return ', %s as %s FROM %s WHERE ( %s )' % (
        tanimoto, self.env['SIMI_FIELD'], join_part_sql, where)
def gen_adv_search_sql(self, query, smiles_dic=None, abbr_dic=None,
                       black_words=None):
    '''
    gen_adv_search_sql(query[, smiles_dic[, abbr_dic[, black_words]]])
    -- Returns a string.

    Translates an advanced-search expression into a partial SQL string.
    Terms of the form 'sub = KEY' / 'sub != KEY' / 'sup = KEY' /
    'sup != KEY' are replaced by the SQL produced by
    gen_search_sql_tuple() for the molecule smiles_dic[KEY]; everything
    else in the query is passed through to the WHERE clause.

    args:
        query: str, the query string.
        smiles_dic: dict, maps molecule keys used in the query to smiles.
            Defaults to an empty dict.
        abbr_dic: dict, maps abbreviated words to their replacement.
            Defaults to an empty dict.
        black_words: iterable of str, words that are rejected when they
            occur anywhere in the query (case-insensitive). Defaults to
            select, insert, update, drop, source.

    returns a partial sql string like ' FROM ... WHERE ( ... )'

    raises:
        Exception when the query contains a black word, references a
        molecule key missing from smiles_dic, uses a sub/sup operator
        without '=', when a 'sub' molecule is not a substructure of a
        'sup' molecule, or when the merged sub/sup count bounds conflict.
    '''
    if smiles_dic is None:
        smiles_dic = {}
    if abbr_dic is None:
        abbr_dic = {}
    if black_words is None:
        black_words = ('select', 'insert', 'update', 'drop', 'source')

    # NOTE(review): the rest of the query is concatenated into the SQL
    # text as-is, so this substring blacklist is the only guard against
    # injection here -- it does not cover e.g. 'delete' or 'union'.
    for word in black_words:
        if re.search(re.escape(str(word)), query, re.IGNORECASE):
            raise Exception('contains illegal words!')

    # Upper-case the stand-alone keywords 'and' / 'or'. Word boundaries keep
    # identifiers such as 'vendor' or 'brand' intact, so the abbreviation
    # and molecule-key lookups below see them unchanged.
    keyword_re = re.compile(r'\b(?:and|or)\b', re.IGNORECASE)
    query = keyword_re.sub(lambda m: m.group(0).upper(), query) + ' '
    # add space before and after >, <, =, >=, etc.
    query = re.sub(r' *[><!=]+ *', lambda m: ' ' + m.group(0) + ' ', query)
    # unabbreviating
    query = re.sub(r'\w+',
                   lambda m: abbr_dic.get(m.group(0), m.group(0)), query)

    # REs for extracting the sub (substructure) / sup (superstructure) terms
    re_sub = re.compile(r'[sS][uU][bB] *[!=]+ *[^ )(]*(?=[ )(]+)')
    re_sup = re.compile(r'[sS][uU][pP] *[!=]+ *[^ )(]*(?=[ )(]+)')

    stats = []

    def make_replacer(match_type, exclude_type):
        # Build the re.sub callback that swaps a sub/sup term for its SQL.
        def replacer(match):
            term = match.group(0).replace(' ', '')
            if '!=' in term:
                sep, search_type = '!=', exclude_type
            elif '=' in term:
                sep, search_type = '=', match_type
            else:
                raise Exception('invalid sub/sup operator: '
                                + match.group(0))
            mol_key = term.split(sep)[-1]
            if mol_key not in smiles_dic:
                raise Exception('unknown molecule key: ' + mol_key)
            stat, sql = self.gen_search_sql_tuple(
                str(smiles_dic[mol_key]), search_type, 1)
            stats.append(stat)
            return sql
        return replacer

    def positive_mols(pattern):
        # Smiles of the molecules used with '=' (not '!=') for this pattern.
        mols = []
        for term in pattern.findall(query):
            if re.search(r'[^!]=', term):
                smi = smiles_dic.get(term.split('=')[-1].strip(' '))
                if smi:
                    mols.append(smi)
        return mols

    def field_for(stat_key):
        # First def_dict entry that maps this stat key, or None.
        for defs in self.def_dict.values():
            if stat_key in defs:
                return defs[stat_key][0]
        return None

    # if the query has both sub_mols and sup_mols, then, the sub_mols must
    # be the substructure of sup_mols
    sub_mols = positive_mols(re_sub)
    sup_mols = positive_mols(re_sup)
    if sup_mols:
        for sub_smi in sub_mols:
            sub_mol = mymol('smi', sub_smi)
            for sup_smi in sup_mols:
                if not sub_mol.sub_match('smi', sup_smi):
                    raise Exception('sub and sup not match')

    result = re_sub.sub(make_replacer('2', '3'), query)
    result = re_sup.sub(make_replacer('4', '5'), result)

    # Merge the per-molecule stats, keeping the largest value per key.
    # The merged bounds are ANDed with the whole expression below.
    sub_bounds = {}
    sup_bounds = {}
    for stat in stats:
        if not stat:
            continue
        if 'sub' in stat:
            target, incoming = sub_bounds, stat['sub']
        elif 'sup' in stat:
            target, incoming = sup_bounds, stat['sup']
        else:
            continue
        for key, value in incoming.items():
            if key not in target or target[key] < value:
                target[key] = value

    conds = []
    for key, low in sub_bounds.items():
        # Looked up per key, so an unmapped key never reuses the column
        # found for a previous key.
        field = field_for(key)
        if not field:
            continue
        if key in sup_bounds:
            high = sup_bounds[key]
            if int(low) == int(high):
                conds.append('%s = %s' % (str(field), str(low)))
            elif int(low) > int(high):
                raise Exception('mol stat range err')
            else:
                conds.append('%s >= %s AND %s <= %s' % (
                    str(field), str(low), str(field), str(high)))
        else:
            conds.append(str(field) + ' >= ' + str(low))
    for key, high in sup_bounds.items():
        if key in sub_bounds:
            continue
        field = field_for(key)
        if field:
            conds.append(str(field) + ' <= ' + str(high))

    result = result.strip(' ')
    if conds:
        result = '(%s) AND (%s)' % (' AND '.join(conds), result)
    else:
        # No count bounds: do not emit an empty "()" group (invalid SQL).
        result = '(%s)' % result

    join_part_sql = self.gen_join_part_sql(self.dbd.tables,
                                           self.env['PRI_FIELD'])
    return ' FROM %s WHERE (%s)' % (join_part_sql, result)
def gen_search_sql_tuple(self, smiles, search_type="1", out_put_type=1):
    '''
    gen_search_sql_tuple(smiles[, search_type[, out_put_type]])
    -- Returns a tuple

    Generates the search sql condition for the molecule given as smiles.

    args:
        smiles: str, smiles that represents a molecule.
        search_type: str. values:
            '1': full match (compares the md5 of the openbabel canonical
                 smiles),
            '2': target structure contains the substructure,
            '3': target structure doesn't contain the substructure,
            '4': target structure is contained in the superstructure,
            '5': target structure isn't contained in the superstructure
        out_put_type: int. values:
            2: prefix the condition with the mol_stat part
               (types '2' and '4' only),
            anything else: without the mol_stat part.

    returns a tuple of two elements:
        0: stat, a dict; for types '2' / '4' it is {'sub': {...}} /
           {'sup': {...}} holding the fingerprint bit count plus the
           molecule statistics (minus the hydrogen count), otherwise {}.
        1: the sql condition string. ({}, '') is returned for type '1'
           when no md5 field is recorded, and ({}, '1') when no
           fingerprint condition could be generated.
    '''
    md5_ob_can_smi_key = self.env['MD5_OPENBABEL_CAN_SMI_KEY']
    fp_bits_key = self.env['FP_BITS_KEY']
    try:
        num_H_key = self.env['NUM_H_KEY']
    except Exception:
        # best effort: fall back to the default key name
        num_H_key = 'NUM_H'

    mol_fp = {}
    mol = mymol('smi', smiles)
    mol.get_fps(mol_fp)

    if search_type == "1":
        if md5_ob_can_smi_key in self.recorded_fields_dict:
            return ({}, self.recorded_fields_dict[md5_ob_can_smi_key] +
                    " = '" +
                    md5(mymol('smi', smiles).gen_openbabel_can_smiles()) +
                    "'")
        return ({}, '')

    stat = {}
    tmp_stat = {}
    head_flag = '('
    sign_mol_stat = ' >= '
    if search_type in ("2", "4"):
        mol_stat = {}
        mol.get_mol_stat(mol_stat)
        mol_stat.pop(num_H_key, None)
        tmp_stat[str(fp_bits_key)] = mol_fp[fp_bits_key]
        tmp_stat.update(mol_stat)
        if search_type == '2':
            stat['sub'] = tmp_stat
        else:
            sign_mol_stat = ' <= '
            stat['sup'] = tmp_stat
    elif search_type in ("3", "5"):
        head_flag = ' NOT ('

    # One bit-mask test per recorded, non-zero fingerprint column.
    fp_terms = []
    for key, bits in sorted(mol_fp.items()):
        if key == fp_bits_key or bits == 0:
            continue
        if key not in self.recorded_fields_dict:
            continue
        field = self.recorded_fields_dict[key]
        if search_type in ('2', '3'):
            # every query bit must be set in the stored fingerprint
            fp_terms.append(field + " & " + str(bits) + ' = ' + str(bits))
        elif search_type in ('4', '5'):
            # every stored bit must be set in the query fingerprint
            fp_terms.append(field + " & " + str(bits) + ' = ' + field)

    if out_put_type == 2 and search_type in ('2', '4'):
        stat_terms = []
        for key, value in tmp_stat.items():
            # Reset per key so an unmapped key never reuses the column
            # found for a previous key.
            val_k = None
            for defs in self.def_dict.values():
                if key in defs:
                    val_k = defs[key][0]
                    break
            if val_k:
                stat_terms.append(str(val_k) + sign_mol_stat + str(value))
        if stat_terms:
            head_flag = '(' + ' AND '.join(stat_terms) + ') AND ('

    if fp_terms:
        # join() instead of rstrip(' AND '): rstrip strips a character set
        # and would eat trailing 'A'/'N'/'D' of a column name.
        return (stat, head_flag + ' AND '.join(fp_terms) + ')')
    return ({}, '1')
def gen_adv_search_sql(self, query, smiles_dic=None, abbr_dic=None, black_words=None):
    """
    gen_adv_search_sql(query[, smiles_dic[, abbr_dic[, black_words]]])
    -- Returns a string.

    Translates an advanced-search expression into a partial SQL string.
    Terms of the form 'sub = KEY' / 'sub != KEY' / 'sup = KEY' /
    'sup != KEY' are replaced by the SQL produced by
    gen_search_sql_tuple() for the molecule smiles_dic[KEY]; everything
    else in the query is passed through to the WHERE clause.

    args:
        query: str, the query string.
        smiles_dic: dict, maps molecule keys used in the query to smiles.
            Defaults to an empty dict.
        abbr_dic: dict, maps abbreviated words to their replacement.
            Defaults to an empty dict.
        black_words: iterable of str, words that are rejected when they
            occur anywhere in the query (case-insensitive). Defaults to
            select, insert, update, drop, source.

    returns a partial sql string like ' FROM ... WHERE ( ... )'

    raises:
        Exception when the query contains a black word, references a
        molecule key missing from smiles_dic, uses a sub/sup operator
        without '=', when a 'sub' molecule is not a substructure of a
        'sup' molecule, or when the merged sub/sup count bounds conflict.
    """
    if smiles_dic is None:
        smiles_dic = {}
    if abbr_dic is None:
        abbr_dic = {}
    if black_words is None:
        black_words = ("select", "insert", "update", "drop", "source")

    # NOTE(review): the rest of the query is concatenated into the SQL
    # text as-is, so this substring blacklist is the only guard against
    # injection here -- it does not cover e.g. 'delete' or 'union'.
    for word in black_words:
        if re.search(re.escape(str(word)), query, re.IGNORECASE):
            raise Exception("contains illegal words!")

    # Upper-case the stand-alone keywords 'and' / 'or'. Word boundaries keep
    # identifiers such as 'vendor' or 'brand' intact, so the abbreviation
    # and molecule-key lookups below see them unchanged.
    keyword_re = re.compile(r"\b(?:and|or)\b", re.IGNORECASE)
    query = keyword_re.sub(lambda m: m.group(0).upper(), query) + " "
    # add space before and after >, <, =, >=, etc.
    query = re.sub(r" *[><!=]+ *", lambda m: " " + m.group(0) + " ", query)
    # unabbreviating
    query = re.sub(r"\w+", lambda m: abbr_dic.get(m.group(0), m.group(0)), query)

    # REs for extracting the sub (substructure) / sup (superstructure) terms
    re_sub = re.compile(r"[sS][uU][bB] *[!=]+ *[^ )(]*(?=[ )(]+)")
    re_sup = re.compile(r"[sS][uU][pP] *[!=]+ *[^ )(]*(?=[ )(]+)")

    stats = []

    def make_replacer(match_type, exclude_type):
        # Build the re.sub callback that swaps a sub/sup term for its SQL.
        def replacer(match):
            term = match.group(0).replace(" ", "")
            if "!=" in term:
                sep, search_type = "!=", exclude_type
            elif "=" in term:
                sep, search_type = "=", match_type
            else:
                raise Exception("invalid sub/sup operator: " + match.group(0))
            mol_key = term.split(sep)[-1]
            if mol_key not in smiles_dic:
                raise Exception("unknown molecule key: " + mol_key)
            stat, sql = self.gen_search_sql_tuple(str(smiles_dic[mol_key]), search_type, 1)
            stats.append(stat)
            return sql

        return replacer

    def positive_mols(pattern):
        # Smiles of the molecules used with '=' (not '!=') for this pattern.
        mols = []
        for term in pattern.findall(query):
            if re.search(r"[^!]=", term):
                smi = smiles_dic.get(term.split("=")[-1].strip(" "))
                if smi:
                    mols.append(smi)
        return mols

    def field_for(stat_key):
        # First def_dict entry that maps this stat key, or None.
        for defs in self.def_dict.values():
            if stat_key in defs:
                return defs[stat_key][0]
        return None

    # if the query has both sub_mols and sup_mols, then, the sub_mols must
    # be the substructure of sup_mols
    sub_mols = positive_mols(re_sub)
    sup_mols = positive_mols(re_sup)
    if sup_mols:
        for sub_smi in sub_mols:
            sub_mol = mymol("smi", sub_smi)
            for sup_smi in sup_mols:
                if not sub_mol.sub_match("smi", sup_smi):
                    raise Exception("sub and sup not match")

    result = re_sub.sub(make_replacer("2", "3"), query)
    result = re_sup.sub(make_replacer("4", "5"), result)

    # Merge the per-molecule stats, keeping the largest value per key.
    # The merged bounds are ANDed with the whole expression below.
    sub_bounds = {}
    sup_bounds = {}
    for stat in stats:
        if not stat:
            continue
        if "sub" in stat:
            target, incoming = sub_bounds, stat["sub"]
        elif "sup" in stat:
            target, incoming = sup_bounds, stat["sup"]
        else:
            continue
        for key, value in incoming.items():
            if key not in target or target[key] < value:
                target[key] = value

    conds = []
    for key, low in sub_bounds.items():
        # Looked up per key, so an unmapped key never reuses the column
        # found for a previous key.
        field = field_for(key)
        if not field:
            continue
        if key in sup_bounds:
            high = sup_bounds[key]
            if int(low) == int(high):
                conds.append("%s = %s" % (str(field), str(low)))
            elif int(low) > int(high):
                raise Exception("mol stat range err")
            else:
                conds.append("%s >= %s AND %s <= %s" % (str(field), str(low), str(field), str(high)))
        else:
            conds.append(str(field) + " >= " + str(low))
    for key, high in sup_bounds.items():
        if key in sub_bounds:
            continue
        field = field_for(key)
        if field:
            conds.append(str(field) + " <= " + str(high))

    result = result.strip(" ")
    if conds:
        result = "(%s) AND (%s)" % (" AND ".join(conds), result)
    else:
        # No count bounds: do not emit an empty "()" group (invalid SQL).
        result = "(%s)" % result

    join_part_sql = self.gen_join_part_sql(self.dbd.tables, self.env["PRI_FIELD"])
    return " FROM %s WHERE (%s)" % (join_part_sql, result)
def gen_simi_search_sql(self, smiles, min_simi):
    """
    gen_simi_search_sql(smiles, min_simi) -- Returns a string.

    Builds the partial SQL used for a Tanimoto similarity search.

    args:
        smiles: str, smiles that represents the query molecule.
        min_simi: float (or anything float() accepts), the minimal
            similarity value (tanimoto coefficient).

    tanimoto coefficient:
        A is the number of ones in the fingerprint of the query molecule
          (read from fps[env['FP_BITS_KEY']])
        B is the number of ones in the fingerprint of the stored molecule
        C is the number of ones in A & B
        similarity = C / (A + B - C)

    returns a partial sql string like
        ', tanimoto as similarity FROM ... WHERE ( ... )'
    where 'tanimoto' is the sql expression computing the coefficient and
    'similarity' is env['SIMI_FIELD'].

    raises:
        KeyError if the fingerprint dict filled by mymol.get_fps() lacks
        env['FP_BITS_KEY'].
    """
    import math

    min_simi = float(min_simi)
    fp_bits_key = self.env["FP_BITS_KEY"]

    stat = {}
    fps = {}
    mol = mymol("smi", smiles)
    mol.get_mol_stat(stat)
    mol.get_fps(fps)

    record_keys = ("NUM_C", "NUM_O", "NUM_N", "NUM_P", "NUM_S", "NUM_F", "NUM_Cl", "NUM_Br", "NUM_I")

    # Element-count pre-filter: each recorded count must lie within
    # [min_simi * n, n / min_simi].  It is skipped for min_simi <= 0, where
    # the upper bound is undefined (division by zero) or meaningless.
    stat_terms = []
    if min_simi > 0:
        for key in record_keys:
            if key in stat and key in self.recorded_fields_dict:
                field = self.recorded_fields_dict[key]
                # math.ceil so that a zero count gives a lower bound of 0;
                # the old "cond and a or b" idiom produced 1 for that case.
                lower = int(math.ceil(min_simi * stat[key]))
                upper = int(1 / min_simi * stat[key])
                stat_terms.append("%s >= %s AND %s <= %s" % (field, lower, field, upper))

    ones_a = fps[fp_bits_key]

    ones_b_terms = []  # BIT_COUNT of every recorded fingerprint column
    common_terms = []  # BIT_COUNT of column & query bits (non-zero only)
    for key in sorted(k for k in fps if k != fp_bits_key):
        if key in self.recorded_fields_dict:
            field = self.recorded_fields_dict[key]
            ones_b_terms.append("BIT_COUNT(%s)" % (field,))
            if fps[key] != 0:
                common_terms.append("BIT_COUNT(%s & %s)" % (field, fps[key]))

    # Fall back to the literal 0 so the expression stays valid SQL when no
    # fingerprint column contributes.
    if ones_b_terms:
        ones_b = "( " + " + ".join(ones_b_terms) + " )"
    else:
        ones_b = "0"
    if common_terms:
        common = "( " + " + ".join(common_terms) + " )"
    else:
        common = "0"

    tanimoto = "%s / ( %s + %s - %s )" % (common, str(ones_a), ones_b, common)
    join_part_sql = self.gen_join_part_sql(self.dbd.tables, self.env["PRI_FIELD"])

    where = "(%s > %s)" % (tanimoto, str(min_simi))
    if stat_terms:
        # Only emit the pre-filter when it is non-empty; "()" is invalid SQL.
        where = "(%s) AND %s" % (" AND ".join(stat_terms), where)

    return ", %s as %s FROM %s WHERE ( %s )" % (
        tanimoto,
        self.env["SIMI_FIELD"],
        join_part_sql,
        where,
    )
def gen_search_sql_tuple(self, smiles, search_type="1", out_put_type=1):
    """
    gen_search_sql_tuple(smiles[, search_type[, out_put_type]])
    -- Returns a tuple

    Generates the search sql condition for the molecule given as smiles.

    args:
        smiles: str, smiles that represents a molecule.
        search_type: str. values:
            '1': full match (compares the md5 of the openbabel canonical
                 smiles),
            '2': target structure contains the substructure,
            '3': target structure doesn't contain the substructure,
            '4': target structure is contained in the superstructure,
            '5': target structure isn't contained in the superstructure
        out_put_type: int. values:
            2: prefix the condition with the mol_stat part
               (types '2' and '4' only),
            anything else: without the mol_stat part.

    returns a tuple of two elements:
        0: stat, a dict; for types '2' / '4' it is {'sub': {...}} /
           {'sup': {...}} holding the fingerprint bit count plus the
           molecule statistics (minus the hydrogen count), otherwise {}.
        1: the sql condition string. ({}, '') is returned for type '1'
           when no md5 field is recorded, and ({}, '1') when no
           fingerprint condition could be generated.
    """
    md5_ob_can_smi_key = self.env["MD5_OPENBABEL_CAN_SMI_KEY"]
    fp_bits_key = self.env["FP_BITS_KEY"]
    try:
        num_H_key = self.env["NUM_H_KEY"]
    except Exception:
        # best effort: fall back to the default key name
        num_H_key = "NUM_H"

    mol_fp = {}
    mol = mymol("smi", smiles)
    mol.get_fps(mol_fp)

    if search_type == "1":
        if md5_ob_can_smi_key in self.recorded_fields_dict:
            return (
                {},
                self.recorded_fields_dict[md5_ob_can_smi_key]
                + " = '"
                + md5(mymol("smi", smiles).gen_openbabel_can_smiles())
                + "'",
            )
        return ({}, "")

    stat = {}
    tmp_stat = {}
    head_flag = "("
    sign_mol_stat = " >= "
    if search_type in ("2", "4"):
        mol_stat = {}
        mol.get_mol_stat(mol_stat)
        mol_stat.pop(num_H_key, None)
        tmp_stat[str(fp_bits_key)] = mol_fp[fp_bits_key]
        tmp_stat.update(mol_stat)
        if search_type == "2":
            stat["sub"] = tmp_stat
        else:
            sign_mol_stat = " <= "
            stat["sup"] = tmp_stat
    elif search_type in ("3", "5"):
        head_flag = " NOT ("

    # One bit-mask test per recorded, non-zero fingerprint column.
    fp_terms = []
    for key, bits in sorted(mol_fp.items()):
        if key == fp_bits_key or bits == 0:
            continue
        if key not in self.recorded_fields_dict:
            continue
        field = self.recorded_fields_dict[key]
        if search_type in ("2", "3"):
            # every query bit must be set in the stored fingerprint
            fp_terms.append(field + " & " + str(bits) + " = " + str(bits))
        elif search_type in ("4", "5"):
            # every stored bit must be set in the query fingerprint
            fp_terms.append(field + " & " + str(bits) + " = " + field)

    if out_put_type == 2 and search_type in ("2", "4"):
        stat_terms = []
        for key, value in tmp_stat.items():
            # Reset per key so an unmapped key never reuses the column
            # found for a previous key.
            val_k = None
            for defs in self.def_dict.values():
                if key in defs:
                    val_k = defs[key][0]
                    break
            if val_k:
                stat_terms.append(str(val_k) + sign_mol_stat + str(value))
        if stat_terms:
            head_flag = "(" + " AND ".join(stat_terms) + ") AND ("

    if fp_terms:
        # join() instead of rstrip(" AND "): rstrip strips a character set
        # and would eat trailing 'A'/'N'/'D' of a column name.
        return (stat, head_flag + " AND ".join(fp_terms) + ")")
    return ({}, "1")