def _parse_study_search_string(self, searchstr, only_with_processed_data=False): """parses string into SQL query for study search Parameters ---------- searchstr : str The string to parse only_with_processed_data : bool Whether or not to return studies with processed data. Returns ------- study_sql : str SQL query for selecting studies with the required metadata columns sample_sql : str SQL query for each study to get the sample ids that mach the query meta_headers : list metadata categories in the query string in alphabetical order Notes ----- All searches are case-sensitive References ---------- .. [1] McGuire P (2007) Getting started with pyparsing. """ # build the parse grammar category = Word(alphas + nums + "_") seperator = oneOf("> < = >= <= !=") | CaselessLiteral("includes") | \ CaselessLiteral("startswith") value = Word(alphas + nums + "_" + ":" + ".") | \ dblQuotedString().setParseAction(removeQuotes) criterion = Group(category + seperator + value) criterion.setParseAction(SearchTerm) and_ = CaselessLiteral("and") or_ = CaselessLiteral("or") not_ = CaselessLiteral("not") optional_seps = Optional(and_ | or_ | not_) # create the grammar for parsing operators AND, OR, NOT search_expr = operatorPrecedence( criterion, [ (not_, 1, opAssoc.RIGHT, SearchNot), (and_, 2, opAssoc.LEFT, SearchAnd), (or_, 2, opAssoc.LEFT, SearchOr)]) # parse the search string to get out the SQL WHERE formatted query eval_stack = (search_expr + stringEnd).parseString(searchstr)[0] sql_where = eval_stack.generate_sql() # this lookup will be used to select only studies with columns # of the correct type type_lookup = {int: 'integer', float: 'float8', str: 'varchar'} # parse out all metadata headers we need to have in a study, and # their corresponding types all_headers = [c[0][0].term[0] for c in (criterion + optional_seps).scanString(searchstr)] meta_headers = set(all_headers) all_types = [c[0][0].term[2] for c in (criterion + optional_seps).scanString(searchstr)] all_types = [type_lookup[type(typecast_string(s))] for s in all_types] # sort headers and types so they return in same order every time. # Should be a relatively short list so very quick # argsort implementation taken from # http://stackoverflow.com/questions/3382352/ # equivalent-of-numpy-argsort-in-basic-python sort_order = sorted(range(len(all_headers)), key=all_headers.__getitem__) all_types = [all_types[x] for x in sort_order] all_headers.sort() # At this point it is possible that a metadata header has been # reference more than once in the query. If the types agree, then we # do not need to do anything. If the types do not agree (specifically, # if it appears to be numerical in one case and string in another), # then we need to give varchar the precedence. meta_header_type_lookup = dict() for header, header_type in zip(all_headers, all_types): if header not in meta_header_type_lookup: meta_header_type_lookup[header] = header_type else: if header_type == 'varchar' or \ meta_header_type_lookup[header] == 'varchar': meta_header_type_lookup[header] = 'varchar' # create the study finding SQL # remove metadata headers that are in required_sample_info table meta_headers = meta_headers.difference(self.required_cols).difference( self.study_cols) # get all study ids that contain all metadata categories searched for sql = [] if meta_headers: # have study-specific metadata, so need to find specific studies for meta in meta_headers: if meta_header_type_lookup[meta] in ('integer', 'float8'): allowable_types = "('integer', 'float8')" else: allowable_types = "('varchar')" sql.append("SELECT study_id FROM qiita.study_sample_columns " "WHERE lower(column_name) = lower('%s') and " "column_type in %s" % (scrub_data(meta), allowable_types)) else: # no study-specific metadata, so need all studies sql.append("SELECT study_id FROM qiita.study_sample_columns") # combine the query if only_with_processed_data: sql.append('SELECT study_id FROM qiita.study_processed_data') study_sql = ' INTERSECT '.join(sql) # create the sample finding SQL, getting both sample id and values # build the sql formatted list of metadata headers header_info = [] for meta in meta_header_type_lookup: if meta in self.required_cols: header_info.append("r.%s" % meta) elif meta in self.study_cols: header_info.append("st.%s" % meta) else: header_info.append("sa.%s" % meta) # build the SQL query sample_sql = ("SELECT r.sample_id,%s FROM qiita.required_sample_info " "r JOIN qiita.sample_{0} sa ON sa.sample_id = " "r.sample_id JOIN qiita.study st ON st.study_id = " "r.study_id WHERE %s" % (','.join(header_info), sql_where)) return study_sql, sample_sql, meta_header_type_lookup.keys()
def test_scrub_data_single_quote(self): """Correctly removes single quotes from the string""" self.assertEqual(scrub_data("'quotes'"), "quotes")
def __init__(self, tokens): self.term = tokens[0] # clean all the inputs for pos, term in enumerate(self.term): self.term[pos] = scrub_data(term)
def test_scrub_data_nothing(self): """Returns the same string without changes""" self.assertEqual(scrub_data("nothing_changes"), "nothing_changes")
def test_scrub_data_semicolon(self): """Correctly removes the semicolon from the string""" self.assertEqual(scrub_data("remove_;_char"), "remove__char")
def _parse_study_search_string(self, searchstr, only_with_processed_data=False): """parses string into SQL query for study search Parameters ---------- searchstr : str The string to parse only_with_processed_data : bool Whether or not to return studies with processed data. Returns ------- study_sql : str SQL query for selecting studies with the required metadata columns sample_sql : str SQL query for each study to get the sample ids that mach the query meta_headers : list metadata categories in the query string in alphabetical order Notes ----- All searches are case-sensitive References ---------- .. [1] McGuire P (2007) Getting started with pyparsing. """ # build the parse grammar category = Word(alphas + nums + "_") seperator = oneOf("> < = >= <= !=") | CaselessLiteral("includes") | \ CaselessLiteral("startswith") value = Word(alphas + nums + "_" + ":" + ".") | \ dblQuotedString().setParseAction(removeQuotes) criterion = Group(category + seperator + value) criterion.setParseAction(SearchTerm) and_ = CaselessLiteral("and") or_ = CaselessLiteral("or") not_ = CaselessLiteral("not") optional_seps = Optional(and_ | or_ | not_) # create the grammar for parsing operators AND, OR, NOT search_expr = operatorPrecedence(criterion, [(not_, 1, opAssoc.RIGHT, SearchNot), (and_, 2, opAssoc.LEFT, SearchAnd), (or_, 2, opAssoc.LEFT, SearchOr)]) # parse the search string to get out the SQL WHERE formatted query eval_stack = (search_expr + stringEnd).parseString(searchstr)[0] sql_where = eval_stack.generate_sql() # this lookup will be used to select only studies with columns # of the correct type type_lookup = {int: 'integer', float: 'float8', str: 'varchar'} # parse out all metadata headers we need to have in a study, and # their corresponding types all_headers = [ c[0][0].term[0] for c in (criterion + optional_seps).scanString(searchstr) ] meta_headers = set(all_headers) all_types = [ c[0][0].term[2] for c in (criterion + optional_seps).scanString(searchstr) ] all_types = [type_lookup[type(typecast_string(s))] for s in all_types] # sort headers and types so they return in same order every time. # Should be a relatively short list so very quick # argsort implementation taken from # http://stackoverflow.com/questions/3382352/ # equivalent-of-numpy-argsort-in-basic-python sort_order = sorted(range(len(all_headers)), key=all_headers.__getitem__) all_types = [all_types[x] for x in sort_order] all_headers.sort() # At this point it is possible that a metadata header has been # reference more than once in the query. If the types agree, then we # do not need to do anything. If the types do not agree (specifically, # if it appears to be numerical in one case and string in another), # then we need to give varchar the precedence. meta_header_type_lookup = dict() for header, header_type in zip(all_headers, all_types): if header not in meta_header_type_lookup: meta_header_type_lookup[header] = header_type else: if header_type == 'varchar' or \ meta_header_type_lookup[header] == 'varchar': meta_header_type_lookup[header] = 'varchar' # create the study finding SQL # remove metadata headers that are in required_sample_info table meta_headers = meta_headers.difference(self.required_cols).difference( self.study_cols) # get all study ids that contain all metadata categories searched for sql = [] if meta_headers: # have study-specific metadata, so need to find specific studies for meta in meta_headers: if meta_header_type_lookup[meta] in ('integer', 'float8'): allowable_types = "('integer', 'float8')" else: allowable_types = "('varchar')" sql.append("SELECT study_id FROM qiita.study_sample_columns " "WHERE lower(column_name) = lower('%s') and " "column_type in %s" % (scrub_data(meta), allowable_types)) else: # no study-specific metadata, so need all studies sql.append("SELECT study_id FROM qiita.study_sample_columns") # combine the query if only_with_processed_data: sql.append('SELECT study_id FROM qiita.study_processed_data') study_sql = ' INTERSECT '.join(sql) # create the sample finding SQL, getting both sample id and values # build the sql formatted list of metadata headers header_info = [] for meta in meta_header_type_lookup: if meta in self.required_cols: header_info.append("r.%s" % meta) elif meta in self.study_cols: header_info.append("st.%s" % meta) else: header_info.append("sa.%s" % meta) # build the SQL query sample_sql = ("SELECT r.sample_id,%s FROM qiita.required_sample_info " "r JOIN qiita.sample_{0} sa ON sa.sample_id = " "r.sample_id JOIN qiita.study st ON st.study_id = " "r.study_id WHERE %s" % (','.join(header_info), sql_where)) return study_sql, sample_sql, meta_header_type_lookup.keys()