Python scrub_data示例，qiita_db.util.scrub_data Python示例

示例#1

0

显示文件

文件： search.py 项目： BrindhaBioinfo/qiita

    def _parse_study_search_string(self, searchstr,
                                   only_with_processed_data=False):
        """parses string into SQL query for study search

        Parameters
        ----------
        searchstr : str
            The string to parse
        only_with_processed_data : bool
            Whether or not to return studies with processed data.

        Returns
        -------
        study_sql : str
            SQL query for selecting studies with the required metadata columns
        sample_sql : str
            SQL query for each study to get the sample ids that mach the query
        meta_headers : list
            metadata categories in the query string in alphabetical order

        Notes
        -----
        All searches are case-sensitive

        References
        ----------
        .. [1] McGuire P (2007) Getting started with pyparsing.
        """
        # build the parse grammar
        category = Word(alphas + nums + "_")
        seperator = oneOf("> < = >= <= !=") | CaselessLiteral("includes") | \
            CaselessLiteral("startswith")
        value = Word(alphas + nums + "_" + ":" + ".") | \
            dblQuotedString().setParseAction(removeQuotes)
        criterion = Group(category + seperator + value)
        criterion.setParseAction(SearchTerm)
        and_ = CaselessLiteral("and")
        or_ = CaselessLiteral("or")
        not_ = CaselessLiteral("not")
        optional_seps = Optional(and_ | or_ | not_)

        # create the grammar for parsing operators AND, OR, NOT
        search_expr = operatorPrecedence(
            criterion, [
                (not_, 1, opAssoc.RIGHT, SearchNot),
                (and_, 2, opAssoc.LEFT, SearchAnd),
                (or_, 2, opAssoc.LEFT, SearchOr)])

        # parse the search string to get out the SQL WHERE formatted query
        eval_stack = (search_expr + stringEnd).parseString(searchstr)[0]
        sql_where = eval_stack.generate_sql()

        # this lookup will be used to select only studies with columns
        # of the correct type
        type_lookup = {int: 'integer', float: 'float8', str: 'varchar'}

        # parse out all metadata headers we need to have in a study, and
        # their corresponding types
        all_headers = [c[0][0].term[0] for c in
                       (criterion + optional_seps).scanString(searchstr)]
        meta_headers = set(all_headers)
        all_types = [c[0][0].term[2] for c in
                     (criterion + optional_seps).scanString(searchstr)]
        all_types = [type_lookup[type(typecast_string(s))] for s in all_types]

        # sort headers and types so they return in same order every time.
        # Should be a relatively short list so very quick
        # argsort implementation taken from
        # http://stackoverflow.com/questions/3382352/
        # equivalent-of-numpy-argsort-in-basic-python
        sort_order = sorted(range(len(all_headers)),
                            key=all_headers.__getitem__)
        all_types = [all_types[x] for x in sort_order]
        all_headers.sort()

        # At this point it is possible that a metadata header has been
        # reference more than once in the query. If the types agree, then we
        # do not need to do anything. If the types do not agree (specifically,
        # if it appears to be numerical in one case and string in another),
        # then we need to give varchar the precedence.
        meta_header_type_lookup = dict()
        for header, header_type in zip(all_headers, all_types):
            if header not in meta_header_type_lookup:
                meta_header_type_lookup[header] = header_type
            else:
                if header_type == 'varchar' or \
                        meta_header_type_lookup[header] == 'varchar':
                    meta_header_type_lookup[header] = 'varchar'

        # create the study finding SQL
        # remove metadata headers that are in required_sample_info table
        meta_headers = meta_headers.difference(self.required_cols).difference(
            self.study_cols)

        # get all study ids that contain all metadata categories searched for
        sql = []
        if meta_headers:
            # have study-specific metadata, so need to find specific studies
            for meta in meta_headers:
                if meta_header_type_lookup[meta] in ('integer', 'float8'):
                    allowable_types = "('integer', 'float8')"
                else:
                    allowable_types = "('varchar')"

                sql.append("SELECT study_id FROM qiita.study_sample_columns "
                           "WHERE lower(column_name) = lower('%s') and "
                           "column_type in %s" %
                           (scrub_data(meta), allowable_types))
        else:
            # no study-specific metadata, so need all studies
            sql.append("SELECT study_id FROM qiita.study_sample_columns")

        # combine the query
        if only_with_processed_data:
            sql.append('SELECT study_id FROM qiita.study_processed_data')
        study_sql = ' INTERSECT '.join(sql)

        # create  the sample finding SQL, getting both sample id and values
        # build the sql formatted list of metadata headers
        header_info = []
        for meta in meta_header_type_lookup:
            if meta in self.required_cols:
                header_info.append("r.%s" % meta)
            elif meta in self.study_cols:
                header_info.append("st.%s" % meta)
            else:
                header_info.append("sa.%s" % meta)
        # build the SQL query
        sample_sql = ("SELECT r.sample_id,%s FROM qiita.required_sample_info "
                      "r JOIN qiita.sample_{0} sa ON sa.sample_id = "
                      "r.sample_id JOIN qiita.study st ON st.study_id = "
                      "r.study_id WHERE %s" %
                      (','.join(header_info), sql_where))
        return study_sql, sample_sql, meta_header_type_lookup.keys()

示例#2

0

显示文件

文件： test_util.py 项目： Jorge-C/qiita

 def test_scrub_data_single_quote(self):
     """Correctly removes single quotes from the string"""
     self.assertEqual(scrub_data("'quotes'"), "quotes")

示例#3

0

显示文件

文件： search.py 项目： BrindhaBioinfo/qiita

 def __init__(self, tokens):
     self.term = tokens[0]
     # clean all the inputs
     for pos, term in enumerate(self.term):
         self.term[pos] = scrub_data(term)

示例#4

0

显示文件

文件： test_util.py 项目： Jorge-C/qiita

 def test_scrub_data_nothing(self):
     """Returns the same string without changes"""
     self.assertEqual(scrub_data("nothing_changes"), "nothing_changes")

示例#5

0

显示文件

文件： test_util.py 项目： Jorge-C/qiita

 def test_scrub_data_semicolon(self):
     """Correctly removes the semicolon from the string"""
     self.assertEqual(scrub_data("remove_;_char"), "remove__char")

示例#6

0

显示文件

    def _parse_study_search_string(self,
                                   searchstr,
                                   only_with_processed_data=False):
        """parses string into SQL query for study search

        Parameters
        ----------
        searchstr : str
            The string to parse
        only_with_processed_data : bool
            Whether or not to return studies with processed data.

        Returns
        -------
        study_sql : str
            SQL query for selecting studies with the required metadata columns
        sample_sql : str
            SQL query for each study to get the sample ids that mach the query
        meta_headers : list
            metadata categories in the query string in alphabetical order

        Notes
        -----
        All searches are case-sensitive

        References
        ----------
        .. [1] McGuire P (2007) Getting started with pyparsing.
        """
        # build the parse grammar
        category = Word(alphas + nums + "_")
        seperator = oneOf("> < = >= <= !=") | CaselessLiteral("includes") | \
            CaselessLiteral("startswith")
        value = Word(alphas + nums + "_" + ":" + ".") | \
            dblQuotedString().setParseAction(removeQuotes)
        criterion = Group(category + seperator + value)
        criterion.setParseAction(SearchTerm)
        and_ = CaselessLiteral("and")
        or_ = CaselessLiteral("or")
        not_ = CaselessLiteral("not")
        optional_seps = Optional(and_ | or_ | not_)

        # create the grammar for parsing operators AND, OR, NOT
        search_expr = operatorPrecedence(criterion,
                                         [(not_, 1, opAssoc.RIGHT, SearchNot),
                                          (and_, 2, opAssoc.LEFT, SearchAnd),
                                          (or_, 2, opAssoc.LEFT, SearchOr)])

        # parse the search string to get out the SQL WHERE formatted query
        eval_stack = (search_expr + stringEnd).parseString(searchstr)[0]
        sql_where = eval_stack.generate_sql()

        # this lookup will be used to select only studies with columns
        # of the correct type
        type_lookup = {int: 'integer', float: 'float8', str: 'varchar'}

        # parse out all metadata headers we need to have in a study, and
        # their corresponding types
        all_headers = [
            c[0][0].term[0]
            for c in (criterion + optional_seps).scanString(searchstr)
        ]
        meta_headers = set(all_headers)
        all_types = [
            c[0][0].term[2]
            for c in (criterion + optional_seps).scanString(searchstr)
        ]
        all_types = [type_lookup[type(typecast_string(s))] for s in all_types]

        # sort headers and types so they return in same order every time.
        # Should be a relatively short list so very quick
        # argsort implementation taken from
        # http://stackoverflow.com/questions/3382352/
        # equivalent-of-numpy-argsort-in-basic-python
        sort_order = sorted(range(len(all_headers)),
                            key=all_headers.__getitem__)
        all_types = [all_types[x] for x in sort_order]
        all_headers.sort()

        # At this point it is possible that a metadata header has been
        # reference more than once in the query. If the types agree, then we
        # do not need to do anything. If the types do not agree (specifically,
        # if it appears to be numerical in one case and string in another),
        # then we need to give varchar the precedence.
        meta_header_type_lookup = dict()
        for header, header_type in zip(all_headers, all_types):
            if header not in meta_header_type_lookup:
                meta_header_type_lookup[header] = header_type
            else:
                if header_type == 'varchar' or \
                        meta_header_type_lookup[header] == 'varchar':
                    meta_header_type_lookup[header] = 'varchar'

        # create the study finding SQL
        # remove metadata headers that are in required_sample_info table
        meta_headers = meta_headers.difference(self.required_cols).difference(
            self.study_cols)

        # get all study ids that contain all metadata categories searched for
        sql = []
        if meta_headers:
            # have study-specific metadata, so need to find specific studies
            for meta in meta_headers:
                if meta_header_type_lookup[meta] in ('integer', 'float8'):
                    allowable_types = "('integer', 'float8')"
                else:
                    allowable_types = "('varchar')"

                sql.append("SELECT study_id FROM qiita.study_sample_columns "
                           "WHERE lower(column_name) = lower('%s') and "
                           "column_type in %s" %
                           (scrub_data(meta), allowable_types))
        else:
            # no study-specific metadata, so need all studies
            sql.append("SELECT study_id FROM qiita.study_sample_columns")

        # combine the query
        if only_with_processed_data:
            sql.append('SELECT study_id FROM qiita.study_processed_data')
        study_sql = ' INTERSECT '.join(sql)

        # create  the sample finding SQL, getting both sample id and values
        # build the sql formatted list of metadata headers
        header_info = []
        for meta in meta_header_type_lookup:
            if meta in self.required_cols:
                header_info.append("r.%s" % meta)
            elif meta in self.study_cols:
                header_info.append("st.%s" % meta)
            else:
                header_info.append("sa.%s" % meta)
        # build the SQL query
        sample_sql = ("SELECT r.sample_id,%s FROM qiita.required_sample_info "
                      "r JOIN qiita.sample_{0} sa ON sa.sample_id = "
                      "r.sample_id JOIN qiita.study st ON st.study_id = "
                      "r.study_id WHERE %s" %
                      (','.join(header_info), sql_where))
        return study_sql, sample_sql, meta_header_type_lookup.keys()

示例#7

0

显示文件

 def __init__(self, tokens):
     self.term = tokens[0]
     # clean all the inputs
     for pos, term in enumerate(self.term):
         self.term[pos] = scrub_data(term)

示例#8

0

显示文件

 def test_scrub_data_single_quote(self):
     """Correctly removes single quotes from the string"""
     self.assertEqual(scrub_data("'quotes'"), "quotes")

示例#9

0

显示文件

 def test_scrub_data_semicolon(self):
     """Correctly removes the semicolon from the string"""
     self.assertEqual(scrub_data("remove_;_char"), "remove__char")

示例#10

0

显示文件

 def test_scrub_data_nothing(self):
     """Returns the same string without changes"""
     self.assertEqual(scrub_data("nothing_changes"), "nothing_changes")