def generate_sql(self):
    """Translate this parsed search term into a SQL WHERE-clause fragment.

    Returns
    -------
    str
        A SQL condition string, e.g. ``sa.ph > 7`` or
        ``LOWER(sa.env) LIKE '%soil%'``.

    Raises
    ------
    QiitaDBIncompatibleDatatypeError
        If the operator is not valid for the inferred argument type
        (e.g. ``<`` applied to a string value).
    """
    # we can assume that the metadata is either in required_sample_info
    # or the study-specific table
    column_name, operator, argument = self.term
    argument_type = type(convert_type(argument))
    # Restrict each inferred argument type to the operators that make
    # sense for it; anything else is an incompatible-datatype error.
    allowable_types = {int: {'<', '<=', '=', '>=', '>'},
                       float: {'<', '<=', '=', '>=', '>'},
                       str: {'=', 'includes', 'startswith'}}
    if operator not in allowable_types[argument_type]:
        raise QiitaDBIncompatibleDatatypeError(operator, argument_type)
    # Qualify the column with the alias of the table it lives in:
    # r = required_sample_info, st = study, sa = study-specific sample table
    # (aliases must match those used by the caller's sample SQL).
    if column_name in self.required_cols:
        column_name = "r.%s" % column_name.lower()
    elif column_name in self.study_cols:
        column_name = "st.%s" % column_name.lower()
    else:
        column_name = "sa.%s" % column_name.lower()
    # NOTE(review): `argument` is interpolated directly into the SQL text
    # with no scrubbing/parameterization here — callers must ensure the
    # search string is trusted or sanitized upstream.
    if operator == "includes":
        # substring search (case-insensitive), so create proper query for it
        return "LOWER(%s) LIKE '%%%s%%'" % (column_name, argument.lower())
    elif operator == "startswith":
        # BUGFIX: 'startswith' previously fell through to the generic branch
        # and produced invalid SQL ("col startswith 'x'"). Emit a LIKE
        # pattern anchored at the start of the value instead.
        return "LOWER(%s) LIKE '%s%%'" % (column_name, argument.lower())
    else:
        # standard comparison, so just return it, adding quotes if string
        if argument_type == str:
            argument = ''.join(("'", argument, "'"))
        return ' '.join([column_name, operator, argument])
def _parse_study_search_string(self, searchstr,
                               only_with_processed_data=False):
    """parses string into SQL query for study search

    Parameters
    ----------
    searchstr : str
        The string to parse
    only_with_processed_data : bool
        Whether or not to return studies with processed data.

    Returns
    -------
    study_sql : str
        SQL query for selecting studies with the required metadata columns
    sample_sql : str
        SQL query for each study to get the sample ids that mach the query
    meta_headers : list
        metadata categories in the query string in alphabetical order

    Notes
    -----
    All searches are case-sensitive
    (NOTE(review): the generated ``includes``/``startswith`` conditions
    lowercase both sides via LOWER(...)/.lower() — confirm whether this
    claim still holds for all operators.)

    References
    ----------
    .. [1] McGuire P (2007) Getting started with pyparsing.
    """
    # build the parse grammar: a criterion is "<category> <separator>
    # <value>", where value may be a bare word or a double-quoted string
    category = Word(alphas + nums + "_")
    seperator = oneOf("> < = >= <= !=") | CaselessLiteral("includes") | \
        CaselessLiteral("startswith")
    value = Word(alphas + nums + "_" + ":" + ".") | \
        dblQuotedString().setParseAction(removeQuotes)
    criterion = Group(category + seperator + value)
    # each matched criterion becomes a SearchTerm (which knows how to
    # render itself as SQL via generate_sql)
    criterion.setParseAction(SearchTerm)
    and_ = CaselessLiteral("and")
    or_ = CaselessLiteral("or")
    not_ = CaselessLiteral("not")
    optional_seps = Optional(and_ | or_ | not_)

    # create the grammar for parsing operators AND, OR, NOT
    # (NOT binds tightest, then AND, then OR)
    search_expr = operatorPrecedence(
        criterion, [
            (not_, 1, opAssoc.RIGHT, SearchNot),
            (and_, 2, opAssoc.LEFT, SearchAnd),
            (or_, 2, opAssoc.LEFT, SearchOr)])

    # parse the search string to get out the SQL WHERE formatted query
    eval_stack = (search_expr + stringEnd).parseString(searchstr)[0]
    sql_where = eval_stack.generate_sql()

    # this lookup will be used to select only studies with columns
    # of the correct type
    type_lookup = {int: 'integer', float: 'float8', str: 'varchar'}

    # parse out all metadata headers we need to have in a study, and
    # their corresponding types (two scanString passes over the same
    # criteria: term[0] is the category name, term[2] the value whose
    # python type determines the required column type)
    all_headers = [c[0][0].term[0] for c in
                   (criterion + optional_seps).scanString(searchstr)]
    meta_headers = set(all_headers)
    all_types = [c[0][0].term[2] for c in
                 (criterion + optional_seps).scanString(searchstr)]
    all_types = [type_lookup[type(convert_type(s))] for s in all_types]

    # sort headers and types so they return in same order every time.
    # Should be a relatively short list so very quick
    # argsort implementation taken from
    # http://stackoverflow.com/questions/3382352/
    # equivalent-of-numpy-argsort-in-basic-python
    sort_order = sorted(range(len(all_headers)),
                        key=all_headers.__getitem__)
    # reorder all_types with the same permutation so header/type pairs
    # stay aligned after all_headers is sorted in place below
    all_types = [all_types[x] for x in sort_order]
    all_headers.sort()

    # At this point it is possible that a metadata header has been
    # reference more than once in the query. If the types agree, then we
    # do not need to do anything. If the types do not agree (specifically,
    # if it appears to be numerical in one case and string in another),
    # then we need to give varchar the precedence.
    meta_header_type_lookup = dict()
    for header, header_type in zip(all_headers, all_types):
        if header not in meta_header_type_lookup:
            meta_header_type_lookup[header] = header_type
        else:
            if header_type == 'varchar' or \
                    meta_header_type_lookup[header] == 'varchar':
                meta_header_type_lookup[header] = 'varchar'

    # create the study finding SQL
    # remove metadata headers that are in required_sample_info table
    # (those columns exist for every study, so they don't restrict
    # which studies can match)
    meta_headers = tuple(meta_headers.difference(
        self.required_cols).difference(self.study_cols))

    # get all study ids that contain all metadata categories searched for
    sql = []
    if meta_headers:
        # have study-specific metadata, so need to find specific studies:
        # one sub-select per category, later combined with INTERSECT so a
        # study must have ALL of them
        for meta in meta_headers:
            # numeric columns are interchangeable (integer/float8);
            # everything else must be varchar
            if meta_header_type_lookup[meta] in ('integer', 'float8'):
                allowable_types = "('integer', 'float8')"
            else:
                allowable_types = "('varchar')"
            sql.append("SELECT study_id FROM qiita.study_sample_columns "
                       "WHERE lower(column_name) = lower('%s') and "
                       "column_type in %s" % (scrub_data(meta),
                                              allowable_types))
    else:
        # no study-specific metadata, so need all studies
        sql.append("SELECT study_id FROM qiita.study_sample_columns")

    # combine the query; the optional processed-data sub-select further
    # restricts the INTERSECT to studies that have processed data
    if only_with_processed_data:
        sql.append('SELECT study_id FROM qiita.study_processed_data')
    study_sql = ' INTERSECT '.join(sql)

    # create the sample finding SQL, getting both sample id and values
    # build the sql formatted list of metadata headers, qualified with
    # the alias of the table each column lives in
    # (r = required_sample_info, st = study, sa = per-study sample table)
    header_info = []
    for meta in meta_header_type_lookup:
        if meta in self.required_cols:
            header_info.append("r.%s" % meta)
        elif meta in self.study_cols:
            header_info.append("st.%s" % meta)
        else:
            header_info.append("sa.%s" % meta)
    # build the SQL query; "{0}" is left as a str.format placeholder so
    # the caller can substitute each study id into qiita.sample_{0}
    sample_sql = ("SELECT r.sample_id,%s FROM qiita.required_sample_info "
                  "r JOIN qiita.sample_{0} sa ON sa.sample_id = "
                  "r.sample_id JOIN qiita.study st ON st.study_id = "
                  "r.study_id WHERE %s" % (','.join(header_info),
                                           sql_where))
    return study_sql, sample_sql, meta_header_type_lookup.keys()