Пример #1
0
    def gen_simi_search_sql(self, smiles, min_simi):
        '''
        gen_simi_search_sql(smiles, min_simi) -- Returns a string.

        args:
            smiles: str, smiles that represents a molecule.
            min_simi: float, the minimal similarity value (tanimoto coefficient)

        tanimoto coefficient
        A is number of ones in fps of mol1
        B is number of ones in fps of mol2
        C is number of ones in A & B
        similarity = C/(A+B-C)

        Besides the tanimoto condition, the WHERE part contains an atom count
        range for every key of record_keys that is present both in the mol
        stat and in self.recorded_fields_dict:
            ceil(min_simi * n) <= field <= int(n / min_simi)
        This range is only generated when min_simi > 0 (no division by zero);
        when no range could be generated the WHERE part holds the tanimoto
        condition alone.

        returns a partial sql string like ' ,tanimoto as similarity FROM ... WHERE ... '
        where 'tanimoto' is the sql string for calculating tanimoto value and 'similarity'
        is the simi_field from env['SIMI_FIELD']
        '''
        min_simi = float(min_simi)
        stat = {}
        fps = {}
        mol = mymol('smi', smiles)
        mol.get_mol_stat(stat)
        mol.get_fps(fps)
        fields = self.recorded_fields_dict

        record_keys = [
            "NUM_C", "NUM_O", "NUM_N", "NUM_P", "NUM_S", "NUM_F", "NUM_Cl",
            "NUM_Br", "NUM_I"
        ]
        range_terms = []
        if min_simi > 0:
            for key in record_keys:
                if key in stat and key in fields:
                    scaled = min_simi * stat[key]
                    # int() truncates, so add one when something was cut off
                    # (ceiling). A count of 0 must stay 0, otherwise the
                    # range becomes 'field >= 1 AND field <= 0'.
                    lower = int(scaled)
                    if lower < scaled:
                        lower += 1
                    upper = int(1 / min_simi * stat[key])
                    range_terms.append(
                        fields[key] + " >= " + str(lower) + " AND " +
                        fields[key] + " <= " + str(upper))

        bits_key = self.env['FP_BITS_KEY']
        # A: the entry stored under FP_BITS_KEY is used as the number of ones
        # of the query molecule, it is not a fingerprint column itself.
        ones_a = fps[bits_key]
        count_terms = []
        common_terms = []
        for key in sorted(k for k in fps if k != bits_key):
            if key in fields:
                count_terms.append('BIT_COUNT(%s)' % (fields[key], ))
                if fps[key] != 0:
                    common_terms.append('BIT_COUNT(%s & %s)' % (
                        fields[key], fps[key]))
        # fall back to a literal 0 so that the expression stays valid sql
        # when there is nothing to sum up
        s_b = '( ' + ' + '.join(count_terms) + ' )' if count_terms else '0'
        s_c = '( ' + ' + '.join(common_terms) + ' )' if common_terms else '0'

        tanimoto = '%s / ( %s + %s - %s )' % (s_c, str(ones_a), s_b, s_c)
        join_part_sql = self.gen_join_part_sql(self.dbd.tables,
                                               self.env['PRI_FIELD'])
        where = '(%s > %s)' % (tanimoto, str(min_simi))
        if range_terms:
            where = '(%s) AND %s' % (' AND '.join(range_terms), where)
        return ', %s as %s FROM %s WHERE ( %s )' % (
            tanimoto, self.env['SIMI_FIELD'], join_part_sql, where)
Пример #2
0
    def gen_adv_search_sql(
            self,
            query,
            smiles_dic={},
            abbr_dic={},
            black_words=['select', 'insert', 'update', 'drop', 'source']):
        '''
        gen_adv_search_sql(query[, smiles_dic[, abbr_dic[, black_words]]]) -- Returns a string.

        args:
            query: str, the query string.
            smiles_dic: dict, contains the molecules which may be used in the query
                (molecule key -> smiles).
            abbr_dic: dict, the dict of the abbreviated key words
                (abbreviation -> replacement text).
            black_words: list, the list of the dangerous words.

        The default arguments are only read, never modified.

        steps:
            1. the query is rejected when it contains a black word
               (case-insensitive substring test).
            2. stand-alone 'and' / 'or' are upper cased, the comparison
               operators are padded with spaces and the abbreviations are
               expanded.
            3. every 'sub = key', 'sub != key', 'sup = key', 'sup != key' part
               is replaced by the sql of gen_search_sql_tuple (search types
               '2', '3', '4', '5'). A part whose key is not in smiles_dic is
               removed from the query.
            4. the stats returned in step 3 are merged (largest value per key)
               and, for the keys found in self.def_dict, put in front of the
               result as field conditions.

        raises:
            Exception: the query contains a black word, a sub molecule does
                not match a sup molecule, or a merged sub stat is larger than
                the sup stat of the same key.
            ValueError: a sub / sup part has neither '=' nor '!='.

        returns a partial sql string like ' FROM ... WHERE ... '
        '''
        stats = []

        # check if query contains black words
        # NOTE(review): this word list is the only check applied to `query`
        # before it becomes part of the sql string.
        lowered_query = query.lower()
        for word in black_words:
            if str(word).lower() in lowered_query:
                raise Exception('contains illegal words!')

        # upper case the sql keywords 'and', 'or'; \b keeps identifiers such
        # as 'formula' or 'brand' intact
        query = re.sub(r'(?i)\b(?:and|or)\b',
                       lambda m: m.group(0).upper(), query) + ' '
        # add space before and after >, <, =, >=, etc.
        query = re.sub(r' *[><!=]+ *', lambda m: ' ' + m.group(0) + ' ', query)

        # unabbreviating
        def expand_abbr(m):
            word = m.group(0)
            return abbr_dic[word] if word in abbr_dic else word

        query = re.sub(r'\w+', expand_abbr, query)

        # RE for extracting the sub (substructure) part
        re_sub = re.compile(r'[sS][uU][bB] *[!=]+ *[^ )(]*(?=[ )(]+)')
        # RE for extracting the sup (superstructure) part
        re_sup = re.compile(r'[sS][uU][pP] *[!=]+ *[^ )(]*(?=[ )(]+)')
        gen_sql = self.gen_search_sql_tuple

        def make_repl(eq_type, ne_type):
            # builds the re.sub callback for one kind of structure part
            def repl(m):
                part = m.group(0).replace(' ', '')
                if '!=' in part:
                    sep, search_type = '!=', ne_type
                elif '=' in part:
                    sep, search_type = '=', eq_type
                else:
                    raise ValueError(
                        'invalid operator in %r' % (m.group(0), ))
                mol_key = part.split(sep)[-1].strip(' ')
                if mol_key not in smiles_dic:
                    # unknown molecule key: the part is dropped from the query
                    return ''
                sql = gen_sql(str(smiles_dic[mol_key]), search_type, 1)
                stats.append(sql[0])
                return sql[1]
            return repl

        def collect_mols(pattern):
            # smiles of all 'xxx = key' parts (the '!=' parts are skipped)
            mols = []
            for part in pattern.findall(query):
                if re.findall(r'[^!]=', part):
                    mol_key = part.split('=')[-1].strip(' ')
                    smi = smiles_dic.get(mol_key)
                    if smi:
                        mols.append(smi)
            return mols

        def merge_max(target, source):
            # keeps the largest value seen for every key
            for key, value in source.items():
                if key not in target or target[key] < value:
                    target[key] = value

        def field_of(key):
            # first field registered for key in self.def_dict, '' if none
            for definition in self.def_dict.values():
                if key in definition:
                    return definition[key][0]
            return ''

        sub_mols = collect_mols(re_sub)
        sup_mols = collect_mols(re_sup)
        # if the query has both sub_mols and sup_mols, then, the sub_mols must be
        # the substructure of sup_mols
        if sup_mols:
            for m in sub_mols:
                mymol_obj = mymol('smi', m)
                for m0 in sup_mols:
                    if not mymol_obj.sub_match('smi', m0):
                        raise Exception('sub and sup not match')

        result = re_sub.sub(make_repl('2', '3'), query)
        result = re_sup.sub(make_repl('4', '5'), result)

        sub_stat = {}
        sup_stat = {}
        for stat in stats:
            if not stat:
                continue
            if 'sub' in stat:
                merge_max(sub_stat, stat['sub'])
            elif 'sup' in stat:
                merge_max(sup_stat, stat['sup'])

        conditions = []
        for key, low in sub_stat.items():
            # the field is looked up per key, a key without a field is skipped
            field = field_of(key)
            if not field:
                continue
            if key in sup_stat:
                high = sup_stat[key]
                if int(low) == int(high):
                    conditions.append('%s = %s' % (str(field), str(low)))
                elif int(low) > int(high):
                    raise Exception('mol stat range err')
                else:
                    conditions.append('%s >= %s AND %s <= %s' % (
                        str(field), str(low), str(field), str(high)))
            else:
                conditions.append(str(field) + ' >= ' + str(low))

        for key, high in sup_stat.items():
            if key in sub_stat:
                continue
            field = field_of(key)
            if field:
                conditions.append(str(field) + ' <= ' + str(high))

        # without any condition the result is left alone, so no empty '()'
        # ends up in the sql
        if conditions:
            result = '(%s) AND (%s)' % (' AND '.join(conditions),
                                        result.strip(' '))

        join_part_sql = self.gen_join_part_sql(self.dbd.tables,
                                               self.env['PRI_FIELD'])
        return ' FROM %s WHERE (%s)' % (join_part_sql, result)
Пример #3
0
    def gen_search_sql_tuple(self, smiles, search_type="1", out_put_type=1):
        '''
        gen_search_sql_tuple(smiles[, search_type[, out_put_type]]) -- Returns a tuple

        args:
            smiles: str, smiles that represents a molecule.
            search_type: str. values:
                '1': full match,
                '2': target structure contains substructure,
                '3': target structure doesn't contain substructure
                '4': target structure contains superstructure,
                '5': target structure doesn't contain superstructure
            out_put_type: int. values:
                1 or not 2: without the mol_stat part.
                2: contains the mol_stat part

        generates the search sql string according to the smiles string.
        returns a tuple contains two elements:
            0: stat, a dict. {'sub': {...}} for search type '2' and
               {'sup': {...}} for search type '4' (the value stored under
               FP_BITS_KEY plus the mol stat without the H count), {} otherwise.
            1: the sql part, a condition only, not a fully generated sql.
        special cases:
            search type '1' returns ({}, "field = 'md5'") or ({}, '') when
            the md5 field is not recorded.
            ({}, '1') is returned when no fingerprint condition was generated.
        '''
        md5_ob_can_smi_key = self.env['MD5_OPENBABEL_CAN_SMI_KEY']
        fp_bits_key = self.env['FP_BITS_KEY']
        try:
            num_H_key = self.env['NUM_H_KEY']
        except LookupError:
            num_H_key = 'NUM_H'
        fields = self.recorded_fields_dict

        mol_fp = {}
        mol = mymol('smi', smiles)
        mol.get_fps(mol_fp)

        if search_type == "1":
            if md5_ob_can_smi_key in fields:
                return ({}, fields[md5_ob_can_smi_key] +
                        " = '" +
                        md5(mymol('smi', smiles).gen_openbabel_can_smiles()) +
                        "'")
            return ({}, '')

        stat = {}
        tmp_stat = {}
        head_flag = '('
        sign_mol_stat = ' >= '
        if search_type in ("2", "4"):
            mol_stat = {}
            mol.get_mol_stat(mol_stat)
            mol_stat.pop(num_H_key, None)
            tmp_stat[str(fp_bits_key)] = mol_fp[fp_bits_key]
            tmp_stat.update(mol_stat)
            if search_type == '2':
                stat['sub'] = tmp_stat
            else:
                sign_mol_stat = ' <= '
                stat['sup'] = tmp_stat
        elif search_type in ("3", "5"):
            head_flag = ' NOT ('

        # The terms are collected in a list and joined afterwards. Trimming a
        # trailing ' AND ' with rstrip() works on a set of characters and
        # would also cut a field name that ends with 'A', 'N' or 'D'.
        fp_terms = []
        for key, bits in sorted(mol_fp.items()):
            if key == fp_bits_key or bits == 0 or key not in fields:
                continue
            field = fields[key]
            if search_type in ('2', '3'):
                fp_terms.append(field + " & " + str(bits) + ' = ' + str(bits))
            elif search_type in ('4', '5'):
                fp_terms.append(field + " & " + str(bits) + ' = ' + field)

        if out_put_type == 2 and search_type in ('2', '4'):
            stat_terms = []
            for key, value in tmp_stat.items():
                # reset per key, so that a key without a field does not reuse
                # the field of the previous key
                field = ''
                for definition in self.def_dict.values():
                    if key in definition:
                        field = definition[key][0]
                        break
                if field:
                    stat_terms.append(str(field) + sign_mol_stat + str(value))
            if stat_terms:
                head_flag = '(' + ' AND '.join(stat_terms) + ') AND ('

        if fp_terms:
            return (stat, head_flag + ' AND '.join(fp_terms) + ')')
        return ({}, '1')
Пример #4
0
    def gen_adv_search_sql(
        self, query, smiles_dic={}, abbr_dic={}, black_words=["select", "insert", "update", "drop", "source"]
    ):

        """
        gen_adv_search_sql(query[, smiles_dic[, abbr_dic[, black_words]]]) -- Returns a string.

        args:
            query: str, the query string.
            smiles_dic: dict, contains the molecules which may be used in the query
                (molecule key -> smiles).
            abbr_dic: dict, the dict of the abbreviated key words
                (abbreviation -> replacement text).
            black_words: list, the list of the dangerous words.

        The default arguments are only read, never modified.

        steps:
            1. the query is rejected when it contains a black word
               (case-insensitive substring test).
            2. stand-alone 'and' / 'or' are upper cased, the comparison
               operators are padded with spaces and the abbreviations are
               expanded.
            3. every 'sub = key', 'sub != key', 'sup = key', 'sup != key' part
               is replaced by the sql of gen_search_sql_tuple (search types
               '2', '3', '4', '5'). A part whose key is not in smiles_dic is
               removed from the query.
            4. the stats returned in step 3 are merged (largest value per key)
               and, for the keys found in self.def_dict, put in front of the
               result as field conditions.

        raises:
            Exception: the query contains a black word, a sub molecule does
                not match a sup molecule, or a merged sub stat is larger than
                the sup stat of the same key.
            ValueError: a sub / sup part has neither '=' nor '!='.

        returns a partial sql string like ' FROM ... WHERE ... '
        """
        stats = []

        # check if query contains black words
        # NOTE(review): this word list is the only check applied to `query`
        # before it becomes part of the sql string.
        lowered_query = query.lower()
        for word in black_words:
            if str(word).lower() in lowered_query:
                raise Exception("contains illegal words!")

        # upper case the sql keywords 'and', 'or'; \b keeps identifiers such
        # as 'formula' or 'brand' intact
        query = re.sub(r"(?i)\b(?:and|or)\b", lambda m: m.group(0).upper(), query) + " "
        # add space before and after >, <, =, >=, etc.
        query = re.sub(r" *[><!=]+ *", lambda m: " " + m.group(0) + " ", query)

        # unabbreviating
        def expand_abbr(m):
            word = m.group(0)
            return abbr_dic[word] if word in abbr_dic else word

        query = re.sub(r"\w+", expand_abbr, query)

        # RE for extracting the sub (substructure) part
        re_sub = re.compile(r"[sS][uU][bB] *[!=]+ *[^ )(]*(?=[ )(]+)")
        # RE for extracting the sup (superstructure) part
        re_sup = re.compile(r"[sS][uU][pP] *[!=]+ *[^ )(]*(?=[ )(]+)")
        gen_sql = self.gen_search_sql_tuple

        def make_repl(eq_type, ne_type):
            # builds the re.sub callback for one kind of structure part
            def repl(m):
                part = m.group(0).replace(" ", "")
                if "!=" in part:
                    sep, search_type = "!=", ne_type
                elif "=" in part:
                    sep, search_type = "=", eq_type
                else:
                    raise ValueError("invalid operator in %r" % (m.group(0),))
                mol_key = part.split(sep)[-1].strip(" ")
                if mol_key not in smiles_dic:
                    # unknown molecule key: the part is dropped from the query
                    return ""
                sql = gen_sql(str(smiles_dic[mol_key]), search_type, 1)
                stats.append(sql[0])
                return sql[1]

            return repl

        def collect_mols(pattern):
            # smiles of all 'xxx = key' parts (the '!=' parts are skipped)
            mols = []
            for part in pattern.findall(query):
                if re.findall(r"[^!]=", part):
                    mol_key = part.split("=")[-1].strip(" ")
                    smi = smiles_dic.get(mol_key)
                    if smi:
                        mols.append(smi)
            return mols

        def merge_max(target, source):
            # keeps the largest value seen for every key
            for key, value in source.items():
                if key not in target or target[key] < value:
                    target[key] = value

        def field_of(key):
            # first field registered for key in self.def_dict, "" if none
            for definition in self.def_dict.values():
                if key in definition:
                    return definition[key][0]
            return ""

        sub_mols = collect_mols(re_sub)
        sup_mols = collect_mols(re_sup)
        # if the query has both sub_mols and sup_mols, then, the sub_mols must be
        # the substructure of sup_mols
        if sup_mols:
            for m in sub_mols:
                mymol_obj = mymol("smi", m)
                for m0 in sup_mols:
                    if not mymol_obj.sub_match("smi", m0):
                        raise Exception("sub and sup not match")

        result = re_sub.sub(make_repl("2", "3"), query)
        result = re_sup.sub(make_repl("4", "5"), result)

        sub_stat = {}
        sup_stat = {}
        for stat in stats:
            if not stat:
                continue
            if "sub" in stat:
                merge_max(sub_stat, stat["sub"])
            elif "sup" in stat:
                merge_max(sup_stat, stat["sup"])

        conditions = []
        for key, low in sub_stat.items():
            # the field is looked up per key, a key without a field is skipped
            field = field_of(key)
            if not field:
                continue
            if key in sup_stat:
                high = sup_stat[key]
                if int(low) == int(high):
                    conditions.append("%s = %s" % (str(field), str(low)))
                elif int(low) > int(high):
                    raise Exception("mol stat range err")
                else:
                    conditions.append("%s >= %s AND %s <= %s" % (str(field), str(low), str(field), str(high)))
            else:
                conditions.append(str(field) + " >= " + str(low))

        for key, high in sup_stat.items():
            if key in sub_stat:
                continue
            field = field_of(key)
            if field:
                conditions.append(str(field) + " <= " + str(high))

        # without any condition the result is left alone, so no empty '()'
        # ends up in the sql
        if conditions:
            result = "(%s) AND (%s)" % (" AND ".join(conditions), result.strip(" "))

        join_part_sql = self.gen_join_part_sql(self.dbd.tables, self.env["PRI_FIELD"])
        return " FROM %s WHERE (%s)" % (join_part_sql, result)
Пример #5
0
    def gen_simi_search_sql(self, smiles, min_simi):
        """
        gen_simi_search_sql(smiles, min_simi) -- Returns a string.

        args:
            smiles: str, smiles that represents a molecule.
            min_simi: float, the minimal similarity value (tanimoto coefficient)

        tanimoto coefficient
        A is number of ones in fps of mol1
        B is number of ones in fps of mol2
        C is number of ones in A & B
        similarity = C/(A+B-C)

        Besides the tanimoto condition, the WHERE part contains an atom count
        range for every key of record_keys that is present both in the mol
        stat and in self.recorded_fields_dict:
            ceil(min_simi * n) <= field <= int(n / min_simi)
        This range is only generated when min_simi > 0 (no division by zero);
        when no range could be generated the WHERE part holds the tanimoto
        condition alone.

        returns a partial sql string like ' ,tanimoto as similarity FROM ... WHERE ... '
        where 'tanimoto' is the sql string for calculating tanimoto value and 'similarity'
        is the simi_field from env['SIMI_FIELD']
        """
        min_simi = float(min_simi)
        stat = {}
        fps = {}
        mol = mymol("smi", smiles)
        mol.get_mol_stat(stat)
        mol.get_fps(fps)
        fields = self.recorded_fields_dict

        record_keys = ["NUM_C", "NUM_O", "NUM_N", "NUM_P", "NUM_S", "NUM_F", "NUM_Cl", "NUM_Br", "NUM_I"]
        range_terms = []
        if min_simi > 0:
            for key in record_keys:
                if key in stat and key in fields:
                    scaled = min_simi * stat[key]
                    # int() truncates, so add one when something was cut off
                    # (ceiling). A count of 0 must stay 0, otherwise the
                    # range becomes 'field >= 1 AND field <= 0'.
                    lower = int(scaled)
                    if lower < scaled:
                        lower += 1
                    upper = int(1 / min_simi * stat[key])
                    range_terms.append(
                        fields[key] + " >= " + str(lower) + " AND " + fields[key] + " <= " + str(upper)
                    )

        bits_key = self.env["FP_BITS_KEY"]
        # A: the entry stored under FP_BITS_KEY is used as the number of ones
        # of the query molecule, it is not a fingerprint column itself.
        ones_a = fps[bits_key]
        count_terms = []
        common_terms = []
        for key in sorted(k for k in fps if k != bits_key):
            if key in fields:
                count_terms.append("BIT_COUNT(%s)" % (fields[key],))
                if fps[key] != 0:
                    common_terms.append("BIT_COUNT(%s & %s)" % (fields[key], fps[key]))
        # fall back to a literal 0 so that the expression stays valid sql
        # when there is nothing to sum up
        s_b = "( " + " + ".join(count_terms) + " )" if count_terms else "0"
        s_c = "( " + " + ".join(common_terms) + " )" if common_terms else "0"

        tanimoto = "%s / ( %s + %s - %s )" % (s_c, str(ones_a), s_b, s_c)
        join_part_sql = self.gen_join_part_sql(self.dbd.tables, self.env["PRI_FIELD"])
        where = "(%s > %s)" % (tanimoto, str(min_simi))
        if range_terms:
            where = "(%s) AND %s" % (" AND ".join(range_terms), where)
        return ", %s as %s FROM %s WHERE ( %s )" % (
            tanimoto,
            self.env["SIMI_FIELD"],
            join_part_sql,
            where,
        )
Пример #6
0
    def gen_search_sql_tuple(self, smiles, search_type="1", out_put_type=1):
        """
        gen_search_sql_tuple(smiles[, search_type[, out_put_type]]) -- Returns a tuple

        args:
            smiles: str, smiles that represents a molecule.
            search_type: str. values:
                '1': full match,
                '2': target structure contains substructure,
                '3': target structure doesn't contain substructure
                '4': target structure contains superstructure,
                '5': target structure doesn't contain superstructure
            out_put_type: int. values:
                1 or not 2: without the mol_stat part.
                2: contains the mol_stat part

        generates the search sql string according to the smiles string.
        returns a tuple contains two elements:
            0: stat, a dict. {'sub': {...}} for search type '2' and
               {'sup': {...}} for search type '4' (the value stored under
               FP_BITS_KEY plus the mol stat without the H count), {} otherwise.
            1: the sql part, a condition only, not a fully generated sql.
        special cases:
            search type '1' returns ({}, "field = 'md5'") or ({}, '') when
            the md5 field is not recorded.
            ({}, '1') is returned when no fingerprint condition was generated.
        """
        md5_ob_can_smi_key = self.env["MD5_OPENBABEL_CAN_SMI_KEY"]
        fp_bits_key = self.env["FP_BITS_KEY"]
        try:
            num_H_key = self.env["NUM_H_KEY"]
        except LookupError:
            num_H_key = "NUM_H"
        fields = self.recorded_fields_dict

        mol_fp = {}
        mol = mymol("smi", smiles)
        mol.get_fps(mol_fp)

        if search_type == "1":
            if md5_ob_can_smi_key in fields:
                return (
                    {},
                    fields[md5_ob_can_smi_key]
                    + " = '"
                    + md5(mymol("smi", smiles).gen_openbabel_can_smiles())
                    + "'",
                )
            return ({}, "")

        stat = {}
        tmp_stat = {}
        head_flag = "("
        sign_mol_stat = " >= "
        if search_type in ("2", "4"):
            mol_stat = {}
            mol.get_mol_stat(mol_stat)
            mol_stat.pop(num_H_key, None)
            tmp_stat[str(fp_bits_key)] = mol_fp[fp_bits_key]
            tmp_stat.update(mol_stat)
            if search_type == "2":
                stat["sub"] = tmp_stat
            else:
                sign_mol_stat = " <= "
                stat["sup"] = tmp_stat
        elif search_type in ("3", "5"):
            head_flag = " NOT ("

        # The terms are collected in a list and joined afterwards. Trimming a
        # trailing ' AND ' with rstrip() works on a set of characters and
        # would also cut a field name that ends with 'A', 'N' or 'D'.
        fp_terms = []
        for key, bits in sorted(mol_fp.items()):
            if key == fp_bits_key or bits == 0 or key not in fields:
                continue
            field = fields[key]
            if search_type in ("2", "3"):
                fp_terms.append(field + " & " + str(bits) + " = " + str(bits))
            elif search_type in ("4", "5"):
                fp_terms.append(field + " & " + str(bits) + " = " + field)

        if out_put_type == 2 and search_type in ("2", "4"):
            stat_terms = []
            for key, value in tmp_stat.items():
                # reset per key, so that a key without a field does not reuse
                # the field of the previous key
                field = ""
                for definition in self.def_dict.values():
                    if key in definition:
                        field = definition[key][0]
                        break
                if field:
                    stat_terms.append(str(field) + sign_mol_stat + str(value))
            if stat_terms:
                head_flag = "(" + " AND ".join(stat_terms) + ") AND ("

        if fp_terms:
            return (stat, head_flag + " AND ".join(fp_terms) + ")")
        return ({}, "1")