Пример #1
0
    def generate_sql(self):
        """Render this search term as a SQL WHERE-clause condition

        Returns
        -------
        str
            A single SQL condition comparing the (table-qualified, lowercased)
            column against the argument

        Raises
        ------
        QiitaDBIncompatibleDatatypeError
            If the operator cannot be applied to the detected type of the
            argument (e.g. ``<`` on a string or ``includes`` on a number)

        Notes
        -----
        ``includes`` and ``startswith`` are rendered as ``LIKE`` patterns
        evaluated on the lowercased column and argument, so they match
        case-insensitively. ``=`` compares the value exactly as given.
        ``%`` and ``_`` inside the argument keep their ``LIKE`` wildcard
        meaning.
        """
        # we can assume that the metadata is either in required_sample_info
        # or the study-specific table
        column_name, operator, argument = self.term
        argument_type = type(convert_type(argument))

        allowable_types = {int: {'<', '<=', '=', '>=', '>'},
                           float: {'<', '<=', '=', '>=', '>'},
                           str: {'=', 'includes', 'startswith'}}

        if operator not in allowable_types[argument_type]:
            raise QiitaDBIncompatibleDatatypeError(operator, argument_type)

        # qualify the column with the alias of the table that holds it
        if column_name in self.required_cols:
            table_alias = "r"
        elif column_name in self.study_cols:
            table_alias = "st"
        else:
            table_alias = "sa"
        column_name = "%s.%s" % (table_alias, column_name.lower())

        if argument_type == str:
            # the value ends up inside a single-quoted SQL literal, so any
            # embedded single quote must be doubled to stay inside it
            argument = argument.replace("'", "''")

        if operator == "includes":
            # substring search, so create proper query for it
            return "LOWER(%s) LIKE '%%%s%%'" % (column_name, argument.lower())

        if operator == "startswith":
            # prefix search: "startswith" is not a SQL operator, so it has to
            # be translated to a LIKE pattern anchored at the start
            return "LOWER(%s) LIKE '%s%%'" % (column_name, argument.lower())

        # standard query so just return it, adding quotes if string
        if argument_type == str:
            argument = "'%s'" % argument
        return ' '.join([column_name, operator, argument])
Пример #2
0
    def _parse_study_search_string(self, searchstr,
                                   only_with_processed_data=False):
        """Parse a search string into the SQL queries for a study search

        Parameters
        ----------
        searchstr : str
            The string to parse
        only_with_processed_data : bool, optional
            If True, the study query is additionally intersected with the
            study ids found in ``qiita.study_processed_data``. Default False.

        Returns
        -------
        study_sql : str
            SQL query for selecting studies with the required metadata columns
        sample_sql : str
            SQL query for each study to get the sample ids that match the
            query. The sample table name still contains a ``{0}``
            placeholder that the caller has to fill in.
        meta_headers : list
            metadata categories in the query string in alphabetical order.
            The columns selected by `sample_sql` after ``sample_id`` follow
            this same order.

        Notes
        -----
        The keywords ``and``, ``or``, ``not``, ``includes`` and
        ``startswith`` are matched case-insensitively by the grammar. How a
        value is compared against a column is decided by ``generate_sql`` of
        the parsed terms.

        References
        ----------
        .. [1] McGuire P (2007) Getting started with pyparsing.
        """
        # build the parse grammar
        category = Word(alphas + nums + "_")
        separator = oneOf("> < = >= <= !=") | CaselessLiteral("includes") | \
            CaselessLiteral("startswith")
        value = Word(alphas + nums + "_" + ":" + ".") | \
            dblQuotedString().setParseAction(removeQuotes)
        criterion = Group(category + separator + value)
        criterion.setParseAction(SearchTerm)
        and_ = CaselessLiteral("and")
        or_ = CaselessLiteral("or")
        not_ = CaselessLiteral("not")
        optional_seps = Optional(and_ | or_ | not_)

        # create the grammar for parsing operators AND, OR, NOT
        search_expr = operatorPrecedence(
            criterion, [
                (not_, 1, opAssoc.RIGHT, SearchNot),
                (and_, 2, opAssoc.LEFT, SearchAnd),
                (or_, 2, opAssoc.LEFT, SearchOr)])

        # parse the search string to get out the SQL WHERE formatted query
        eval_stack = (search_expr + stringEnd).parseString(searchstr)[0]
        sql_where = eval_stack.generate_sql()

        # this lookup will be used to select only studies with columns
        # of the correct type
        type_lookup = {int: 'integer', float: 'float8', str: 'varchar'}

        # Scan the search string once, collecting every metadata header and
        # the SQL type its argument looks like. A header can be referenced
        # more than once: if the types agree nothing changes, and if one
        # reference looks numerical and another looks like a string, varchar
        # takes precedence. Otherwise the first type seen is kept.
        meta_header_type_lookup = dict()
        for tokens, _, _ in (criterion + optional_seps).scanString(searchstr):
            term = tokens[0].term
            header = term[0]
            header_type = type_lookup[type(convert_type(term[2]))]
            if header_type == 'varchar' or \
                    header not in meta_header_type_lookup:
                meta_header_type_lookup[header] = header_type

        # sort the headers so they come back in the same order every time,
        # independently of dict ordering; the SELECT column list below and
        # the returned header list both use this order
        sorted_headers = sorted(meta_header_type_lookup)

        # create the study finding SQL
        # remove metadata headers that are in required_sample_info table
        # or in the study table
        meta_headers = [header for header in sorted_headers
                        if header not in self.required_cols and
                        header not in self.study_cols]

        # get all study ids that contain all metadata categories searched for
        sql = []
        if meta_headers:
            # have study-specific metadata, so need to find specific studies
            for meta in meta_headers:
                if meta_header_type_lookup[meta] in ('integer', 'float8'):
                    allowable_types = "('integer', 'float8')"
                else:
                    allowable_types = "('varchar')"

                sql.append("SELECT study_id FROM qiita.study_sample_columns "
                           "WHERE lower(column_name) = lower('%s') and "
                           "column_type in %s" %
                           (scrub_data(meta), allowable_types))
        else:
            # no study-specific metadata, so need all studies
            sql.append("SELECT study_id FROM qiita.study_sample_columns")

        # combine the query
        if only_with_processed_data:
            sql.append('SELECT study_id FROM qiita.study_processed_data')
        study_sql = ' INTERSECT '.join(sql)

        # create  the sample finding SQL, getting both sample id and values
        # build the sql formatted list of metadata headers
        header_info = []
        for meta in sorted_headers:
            if meta in self.required_cols:
                header_info.append("r.%s" % meta)
            elif meta in self.study_cols:
                header_info.append("st.%s" % meta)
            else:
                header_info.append("sa.%s" % meta)
        # build the SQL query; the {0} in the sample table name is left
        # unformatted here on purpose
        sample_sql = ("SELECT r.sample_id,%s FROM qiita.required_sample_info "
                      "r JOIN qiita.sample_{0} sa ON sa.sample_id = "
                      "r.sample_id JOIN qiita.study st ON st.study_id = "
                      "r.study_id WHERE %s" %
                      (','.join(header_info), sql_where))
        return study_sql, sample_sql, sorted_headers