Пример #1
0
 def refine_queries(self, agg_result):
     """
     Select which fishing queries should be recorded in the results
     database.

     'agg_result' is the aggregator output for this BOQ; its
     qs.QRY_SUBRESULTS list must be parallel to self.queries (one
     sub-result per query, asserted below).  For each query, every
     candidate value whose match count lies within the query
     category's result-set-size bounds is collected, and one candidate
     is chosen uniformly at random.  The chosen (query, result) pairs
     are stored in self.refined_queries_results.
     """
     #selecting queries that match.
     queries = []
     assert len(self.queries) == len(agg_result[qs.QRY_SUBRESULTS])
     for q, r in zip(self.queries, agg_result[qs.QRY_SUBRESULTS]):
         assert q
         assert r
         # NOTE(review): '>=' rather than '==' -- presumably the query
         # qid can be remapped upward elsewhere (see
         # qids.full_where_has_been_seen below); confirm intended.
         assert q[qs.QRY_QID] >= r[qs.QRY_QID]
         potential_queries = []
         for (value,
              value_result) in r[qs.QRY_FISHING_MATCHES_FOUND].iteritems():
             count = len(value_result)
             # keep only values whose hit count is within the RSS
             # bounds for this query's category
             if qbs.get_rss_lower(q[qs.QRY_ENUM]) <= count and\
                qbs.get_rss_upper(q[qs.QRY_ENUM]) >= count:
                 (value, where) = self.format_value_and_where(
                     sv.sql_name_to_enum(q[qs.QRY_FIELD]), value)
                 q[qs.QRY_VALUE] = value
                 q[qs.QRY_WHERECLAUSE] = where
                 r[rdb.DBF_MATCHINGRECORDIDS] = value_result
                 # NOTE(review): every tuple appended here aliases the
                 # same q/r dicts mutated in place above, so after this
                 # loop all candidates carry the values of the *last*
                 # accepted iteration regardless of which is sampled
                 # below -- confirm this is the intended behavior.
                 potential_queries.append((q, r))
         if potential_queries:
             # pick one accepted candidate uniformly at random
             chosen_q = random.sample(potential_queries, 1)[0]
             chosen_q[0][qs.QRY_QID] = \
                 qids.full_where_has_been_seen(chosen_q[0][qs.QRY_QID],
                                               chosen_q[0][qs.QRY_WHERECLAUSE])
             queries.append(chosen_q)
     #capping at choose-num number of queries
     self.refined_queries_results = queries
    def setUp(self):
        """
        Build full- and short-path XmlQueryGenerator instances backed
        by distributions learned from the mock PUMS and name data.

        The random seed is taken from the wall clock and recorded in
        self.seed_msg so a failing run can be reproduced.
        """
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)
        #set up initialization values
        self.dummy_logger = logging.getLogger('dummy')
        self.dummy_logger.addHandler(logging.NullHandler())
        # minimal stand-in for the options object the learners expect
        class Options(object):
            pass
        learner_options = Options()
        learner_options.verbose = False

        # learn demographic distributions from the mock PUMS data
        pums_files = \
            [("mock pums", 
              stringio.StringIO(mock_data_files.mock_pums_data))]
        pums_dict = \
            learn_distributions.learn_pums_dists(learner_options,
                                                 self.dummy_logger,
                                                 pums_files)

        # learn first/last name distributions from the mock name lists
        names_files = \
            [('male_first_names.txt', 
              stringio.StringIO(mock_data_files.mock_male_first_names)),
             ('female_first_names.txt', 
              stringio.StringIO(mock_data_files.mock_female_first_names)),
             ('last_names.txt', 
              stringio.StringIO(mock_data_files.mock_last_names))]
        names_dict = \
            learn_distributions.learn_name_dists(learner_options,
                                                 self.dummy_logger,
                                                 names_files)

        # variables the distribution holder will serve, in order
        vars = [sv.VARS.SEX,
                sv.VARS.CITIZENSHIP,
                sv.VARS.AGE,
                sv.VARS.RACE,
                sv.VARS.STATE,
                sv.VARS.FIRST_NAME,
                sv.VARS.LAST_NAME]

        var_order = vars
        var_names = [sv.VARS.to_string(x) for x in vars]
        dist_dict = { }
        dist_dict.update(pums_dict)
        dist_dict.update(names_dict)

        dist_holder = dh.DistributionHolder(var_order, var_names, dist_dict)

        fields = [sv.sql_name_to_enum('xml')]

        self._dist1 = xml_generator.XmlGenerator(dist_holder)
        dists = [self._dist1]
        # per-query-schema columns: query count, RSS bounds, path type
        other_fields = ['no_queries', 'r_lower', 'r_upper', 'path_type'] 
        other_cols_full = [[5, 1, 10, 'full']]
        other_cols_short = [[5,1, 10,'short']]
        self.full_generator = xqg.XmlQueryGenerator('P11','', ["LL"],dists, fields, 1000,
                                                    100,other_fields, other_cols_full)
        self.short_generator = xqg.XmlQueryGenerator('P11','', ["LL"],dists, fields, 1000,
                                                    100,other_fields, other_cols_short)
Пример #3
0
 def __init__(self, query):
     """
     Cache the per-query values this handler needs from 'query'.

     Stores the query id, field enum, the two alarmwords (as a set so
     membership tests are order-independent), the required alarmword
     distance, and the record-count cutoff for the query's category.
     """
     self._qid = query[qs.QRY_QID]
     self._field = sv.sql_name_to_enum(query[qs.QRY_FIELD])
     # set literal instead of set([...]) -- same behavior, clearer intent
     self._alarmwords = {query[qs.QRY_ALARMWORDONE],
                         query[qs.QRY_ALARMWORDTWO]}
     self._alarmword_distance = query[qs.QRY_ALARMWORDDISTANCE]
     # stop processing once this many records have been seen
     self._process_cutoff = qbs.get_rss_upper(query[qs.QRY_ENUM])
     self._count = 0
Пример #4
0
 def _generate_short_queries(self, dist, q):
     '''
     Generates queries of the form .//LEAF

     'dist' is the xml value distribution and 'q' is the query-schema
     row giving how many queries to make and their result-set-size
     bounds.  Returns an aqb.XmlQueryBatch of the over-generated
     candidate query dicts.
     '''
     query_dicts = []
     query_count = 0
     for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
         self.__count += 1
         # BUG FIX: this line used to read 'query_cout = count' (typo),
         # so query_count stayed 0 and the batch sizing in the return
         # statement below was wrong.
         query_count = count
         LOGGER.info('P11: Created %d out of %d queries' % \
                     (self.__count, self.__total))
         # scale the record-set-size bounds down to a per-leaf density
         # NOTE(review): on Python 2 this truncates if both operands
         # are ints -- confirm QRY_LRSS/db_size are floats.
         r_lower = q[qs.QRY_LRSS] / (self.__db_size * xg.XML_DEPTH *
                                     xg.FAN_OUT)
         r_upper = q[qs.QRY_URSS] / (self.__db_size * xg.XML_DEPTH *
                                     xg.FAN_OUT)
         (field, value) = self._create_equality_leaf(dist, r_lower, r_upper)
         value = sv.VAR_CONVERTERS[sv.sql_name_to_enum(field)].to_csv(value)
         # escape single quotes for SQL; non-string values raise and
         # are left untouched
         try:
             value = value.replace('\'', '\'\'')
         except (TypeError, AttributeError):
             pass
         # numeric-ish fields are unquoted in the where clause
         if field in ['foo', 'age', 'income']:
             where = "xml_value(xml,\'//%s\', %s)" % (field, value)
         else:
             where = "xml_value(xml,\'//%s\', \'%s\')" % (field, value)
         xpath = field
         qid = qids.query_id()
         # skip where-clauses that have already been generated
         if qid != qids.full_where_has_been_seen(qid, where):
             continue
         query_dicts.append({
             qs.QRY_ENUM: qs.CAT.P11_SHORT,
             qs.QRY_QID: qid,
             qs.QRY_DBNUMRECORDS: self.__db_size,
             qs.QRY_DBRECORDSIZE: self.__row_width,
             qs.QRY_PERF: self.__perf,
             qs.QRY_CAT: self.__cat,
             qs.QRY_SUBCAT: 'eq-double-slash',
             qs.QRY_WHERECLAUSE: where,
             qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
             qs.QRY_NEGATE: False,
             qs.QRY_FIELDTYPE: 'string',
             qs.QRY_LRSS: q[qs.QRY_LRSS],
             qs.QRY_URSS: q[qs.QRY_URSS],
             qs.QRY_VALUE: value,
             qs.QRY_XPATH: xpath
         })
     return aqb.XmlQueryBatch(
         query_dicts, query_count,
         max(int((query_count + 1) / OVER_GENERATION_RATIO), 1), True)
 def setUp(self):
     """Create the P2 foo-range query generator exercised by the tests."""
     # record the random seed so a failing run can be reproduced
     self.seed = int(time.time())
     self.seed_msg = "Random seed used for this test: %s" % self.seed
     self.longMessage = True
     spar_random.seed(self.seed)
     # initialization values for the generator under test
     subcategory = 'foo-range'
     self._foo_dist = bespoke_distribution.FooDistribution()
     query_fields = [sv.sql_name_to_enum('foo')]
     distributions = [self._foo_dist]
     # per-query-schema columns: count, RSS bounds, exponent bounds, type
     extra_field_names = ['no_queries', 'r_lower', 'r_upper',
                          'r_exp_lower', 'r_exp_upper', 'type']
     extra_field_rows = [
         [2, 1, 100, 21, 21, 'range'],
         [2, 1, 100, 32, 32, 'range'],
         [2, 1, 200, 21, 21, 'greater'],
         [2, 1, 200, 25, 25, 'greater'],
     ]
     self.generator = frqg.FooRangeQueryGenerator(
         'P2', subcategory, ["LL"], distributions, query_fields,
         50000, 100, extra_field_names, extra_field_rows)
Пример #6
0
 def __init__(self, query):
     ''' Initialize the needed class variables from the query '''
     # query id and the field this query targets
     self._qid = query[qs.QRY_QID]
     self._field = sv.sql_name_to_enum(query[qs.QRY_FIELD])
     # try/except block is mostly for backwards compatability
     # with unit tests
     try:
         self._process_cutoff = qbs.get_rss_upper(query[qs.QRY_ENUM])
     except KeyError:
         # queries without a category enum get an effectively
         # unlimited cutoff
         self._process_cutoff = 100000
     # records processed so far
     self._count = 0
     #If the query is atomic (i.e. top level), we want to apply a limit
     #on what it can collect, otherwise we want no process limit in
     #effect
     try:
         self._top_level = query['top_level']
     except KeyError:
         self._top_level = True
Пример #7
0
    def testGenerateQuery(self):
        """
        Tests range query generator against a 'db' to make sure it is
        generating the right queries.

        Builds 1000 random rows, produces the queries, and checks each
        query's match count against its requested result-set-size
        bounds within a factor of ten; at most 6 queries in the last
        checked window may miss.
        """
        #generate a 'db' to test against
        rows = []
        for _ in xrange(1000):
            row_dict = {}
            for var in self.fields_to_gen:
                dist = self.dist_holder.dist_dict[var]
                v = dist.generate(row_dict)
                if var != sv.VARS.DOB:
                    row_dict[var] = sv.VAR_CONVERTERS[var].to_csv(v)
                else:
                    row_dict[var] = v
            rows.append(row_dict)

        #generate queries
        query_batches = self.generator.produce_query_batches()
        queries = []
        for query_batch in query_batches:
            queries += query_batch.produce_queries()

        #check queries against 'db' to make sure they match within a factor
        #of two
        count = 0
        # BUG FIX: fail_count/fail_msg are initialized before the loop
        # so an empty query list can no longer raise NameError at the
        # final assertion.
        fail_count = 0
        fail_msg = ''
        for q in queries:
            # the failure counter resets every third query; only the
            # final window is compared against the threshold below
            if count % 3 == 0:
                fail_count = 0
            count += 1
            # build the predicate for this query's sub-category; bounds
            # are bound as lambda defaults so each closure captures its
            # own values rather than the loop variables
            if q[qs.QRY_SUBCAT] == 'range':
                lower = q[qs.QRY_LBOUND]
                # renamed from 'max', which shadowed the builtin
                upper = q[qs.QRY_UBOUND]
                val = (lower, upper)
                x = lambda y, lo=lower, hi=upper: y >= lo and y <= hi
            elif q[qs.QRY_SUBCAT] == 'greater':
                val = q[qs.QRY_VALUE]
                x = lambda y, lo=val: y >= lo
                val = str(val)
            else:
                val = q[qs.QRY_VALUE]
                x = lambda y, hi=val: y <= hi
                val = str(val)
            count_match = len([
                row for row in rows
                if x(row[sv.sql_name_to_enum(q[qs.QRY_FIELD])])
            ])
            msg = 'Query %d was: \n' \
                  'sub_cat: %s\n'\
                  'field: %s\n'\
                  'type: %s\n'\
                  'r_lower: %d\n'\
                  'r_upper: %d\n'\
                  'count: %d\n'\
                  'value: %s\n' % (q[qs.QRY_QID], q[qs.QRY_SUBCAT],
                                   q[qs.QRY_FIELD], q[qs.QRY_SUBCAT],
                                   q[qs.QRY_LRSS], q[qs.QRY_URSS],
                                   count, val)
            if count_match > q[qs.QRY_URSS] * 10 or count_match < q[
                    qs.QRY_LRSS] / 10:
                fail_count += 1
                fail_msg = msg
        self.assertLessEqual(fail_count, 6, fail_msg)
Пример #8
0
    def setUp(self):
        """
        Build an EqualityQueryGenerator over first/last name
        distributions learned from the mock data files.

        The random seed is taken from the wall clock and recorded in
        self.seed_msg so a failing run can be reproduced.
        """
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        #set up initialization values
        # minimal stand-in for the options object the learners expect
        class Object(object):
            pass

        self.learner_options = Object()
        self.dummy_logger = logging.getLogger('dummy')
        self.dummy_logger.addHandler(logging.NullHandler())
        self.dummy_object = Object()
        # learn demographic distributions from the mock PUMS data
        pums_files = \
            [("mock pums",
              stringio.StringIO(mock_data_files.mock_pums_data))]
        pums_dict = \
            learn_distributions.learn_pums_dists(self.learner_options,
                                                 self.dummy_logger,
                                                 pums_files)
        # learn first/last name distributions from the mock name lists
        names_files = \
            [('male_first_names.txt',
              stringio.StringIO(mock_data_files.mock_male_first_names)),
             ('female_first_names.txt',
              stringio.StringIO(mock_data_files.mock_female_first_names)),
             ('last_names.txt',
              stringio.StringIO(mock_data_files.mock_last_names))]
        names_dict = \
            learn_distributions.learn_name_dists(self.learner_options,
                                                 self.dummy_logger,
                                                 names_files)
        # learn zipcode distributions
        zipcode_files = \
            [('mock_zipcodes',
              stringio.StringIO(mock_data_files.mock_zipcodes))]
        zipcode_dict = \
            learn_distributions.learn_zipcode_dists(self.learner_options,
                                                    self.dummy_logger,
                                                    zipcode_files)

        # train the text engine used for notes fields
        text_files = \
            [('mock_text',
              stringio.StringIO(mock_data_files.mock_text_files))]
        text_engine = \
            learn_distributions.train_text_engine(self.learner_options,
                                                  self.dummy_logger,
                                                  text_files)
        # learn street address distributions
        streets_files = \
            [('mock street file',
              stringio.StringIO(mock_data_files.mock_street_names))]
        address_dict = \
                learn_distributions.learn_street_address_dists(self.learner_options,
                                                               self.dummy_logger,
                                                               streets_files)
        # bundle everything into a single distribution holder
        self.dist_holder = \
            learn_distributions.make_distribution_holder(self.learner_options,
                                                         self.dummy_logger,
                                                         pums_dict,
                                                         names_dict,
                                                         zipcode_dict,
                                                         address_dict,
                                                         text_engine)
        self.fields_to_gen = [
            sv.VARS.SEX, sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME
        ]
        sub_cat = 'eq'
        fields = [sv.sql_name_to_enum('fname'), sv.sql_name_to_enum('lname')]
        dists1 = [
            self.dist_holder.dist_dict[sv.VARS.FIRST_NAME],
            self.dist_holder.dist_dict[sv.VARS.LAST_NAME]
        ]
        # per-query-schema columns: query count and RSS bounds
        other_fields = ['no_queries', 'r_lower', 'r_upper']
        other_cols = [[5, 1, 10], [5, 11, 100]]
        self.generator = eqg.EqualityQueryGenerator('EQ', sub_cat, ["LL"],
                                                    dists1, fields, 1000, 100,
                                                    other_fields, other_cols)
Пример #9
0
    def testGenerateQuery(self):
        """
        Tests equality query generator against a 'db' to make sure it is
        generating the right queries.

        Builds 1000 random rows, produces the equality queries, checks
        that each query's match count falls within a factor of two of
        its requested result-set-size bounds, and that both fields got
        roughly the requested number of queries.
        """
        #generate a 'db' to test against
        rows = []
        for _ in xrange(1000):
            row_dict = {}
            for var in self.fields_to_gen:
                dist = self.dist_holder.dist_dict[var]
                v = dist.generate(row_dict)
                row_dict[var] = v
            rows.append(row_dict)

        #generate queries
        query_batches = self.generator.produce_query_batches()
        query = []
        for query_batch in query_batches:
            query += query_batch.produce_queries()

        # per-field query counters
        q_dist1 = 0
        q_dist2 = 0

        #check queries against 'db' to make sure they match within a factor
        #of two
        working_queries = 0
        non_working_queries = []
        for q in query:
            # equality queries carry no sub-category
            self.assertEqual('', q[qs.QRY_SUBCAT], self.seed_msg)
            if q[qs.QRY_FIELD] == 'fname':
                q_dist1 += 1
            elif q[qs.QRY_FIELD] == 'lname':
                q_dist2 += 1
            count_match = len([
                x for x in rows
                if x[sv.sql_name_to_enum(q[qs.QRY_FIELD])] == q[qs.QRY_VALUE]
            ])
            msg = 'Query %d was: \n' \
                  'sub_cat: %s\n'\
                  'field: %s\n'\
                  'r_lower: %d\n'\
                  'r_upper: %d\n'\
                  'value: %s\n' % (q[qs.QRY_QID], q[qs.QRY_SUBCAT],
                                   q[qs.QRY_FIELD], q[qs.QRY_LRSS],
                                   q[qs.QRY_URSS], q[qs.QRY_VALUE])
            if count_match <= q[qs.QRY_URSS] * 2 and count_match >= q[
                    qs.QRY_LRSS] / 2:
                working_queries += 1
            else:
                non_working_queries.append(msg)

        # report at most the first three failing queries
        fail_msg = ''
        for msg in non_working_queries[:3]:
            fail_msg += msg

        self.assertGreaterEqual(working_queries, 10, fail_msg)
        #check to see each field had the correct number of queries
        #ideally this number would be greater than 5 (the requested amount)
        #but because the distribution used for unit testing is so small
        #there is a greater margin of error at this scale
        self.assertGreaterEqual(q_dist1, 4, self.seed_msg)
        self.assertGreaterEqual(q_dist2, 4, self.seed_msg)
Пример #10
0
    def testGenerateQuery(self):
        """
        Tests threshold query generator against a 'db' to make sure it
        is generating the right queries.

        Builds 1000 rows, expands each generated query into all ordered
        3-clause permutations of its six sub-queries, and checks that
        both the 2-of-3 match count and the first-two-terms match total
        fall within a factor of two of the requested bounds.
        """
        #generate a 'db' to test against
        rows = []
        for x in xrange(1000):
            row_dict = {sv.VARS.ID: x}
            for var in self.fields_to_gen:
                dist = self.dist_holder.dist_dict[var]
                v = dist.generate(row_dict)
                row_dict[var] = sv.VAR_CONVERTERS[var].to_agg_fmt(v)
            rows.append(row_dict)
        #generate queries
        query_batches = self.generator.produce_query_batches()
        query_value_sets = []
        for query_batch in query_batches:
            queries = query_batch.produce_queries()
            for query in queries:
                for (a, b, c) in itertools.permutations(range(6), 3):
                    query_value_sets.append({
                        'first_clause':
                        query['sub_queries'][0][a][qs.QRY_VALUE],
                        'first_clause_field':
                        query['sub_queries'][0][a][qs.QRY_FIELD],
                        'second_clause':
                        query['sub_queries'][0][b][qs.QRY_VALUE],
                        'second_clause_field':
                        query['sub_queries'][0][b][qs.QRY_FIELD],
                        'third_clause':
                        query['sub_queries'][0][c][qs.QRY_VALUE],
                        'third_clause_field':
                        query['sub_queries'][0][c][qs.QRY_FIELD],
                        'r_lower':
                        query['r_lower'],
                        'r_upper':
                        query['r_upper'],
                        'sftm_lower':
                        query['ftm_lower'],
                        'sftm_upper':
                        query['ftm_upper']
                    })

        #check to see right number of queries generated
        self.assertEqual(len(query_value_sets), 2400, self.seed_msg)

        #check queries against 'db' to make sure they match within a factor
        #of two
        working_queries = 0
        non_working_queries = []
        # renamed from 'id', which shadowed the builtin
        id_col = sv.VARS.ID
        for q in query_value_sets:
            first_field = sv.sql_name_to_enum(q['first_clause_field'])
            second_field = sv.sql_name_to_enum(q['second_clause_field'])
            third_field = sv.sql_name_to_enum(q['third_clause_field'])
            ft = [x[id_col] for x in rows if x[first_field] == q['first_clause']]
            st = [x[id_col] for x in rows if x[second_field] == q['second_clause']]
            tt = [x[id_col] for x in rows if x[third_field] == q['third_clause']]
            # ids matching at least two of the three clauses
            matching_ids_set = set()
            for m_set in itertools.combinations([ft, st, tt], 2):
                matching_ids_set.update(
                    reduce(set.intersection, [set(x) for x in m_set]))
            count_match = len(matching_ids_set)
            # NOTE(review): only the first two terms are summed here,
            # matching the 'sftm' bounds checked below -- confirm the
            # third term is deliberately excluded.
            total_match = len(ft) + len(st)
            # BUG FIX: the second bound label used to read 'ftm_lower'
            # again; both labels now name the values they print.
            msg = 'Query was\n'\
                  'where: %s=%s AND %s=%s\n'\
                  'sftm_lower: %d\n'\
                  'sftm_upper: %d\n'\
                  'r_lower: %d\n'\
                  'r_upper: %d\n'\
                  'sftm_match: %d\n'\
                  'count_match: %d\n'\
                  '\n' % (q['first_clause_field'], q['first_clause'],
                          q['second_clause_field'], q['second_clause'],
                          q['sftm_lower'], q['sftm_upper'], q['r_lower'],
                          q['r_upper'], total_match, count_match)
            # NOTE(review): q is a plain dict keyed by 'r_lower' /
            # 'r_upper'; the lookups below assume qs.QRY_URSS ==
            # 'r_upper' and qs.QRY_LRSS == 'r_lower' -- confirm.
            if count_match <= q[qs.QRY_URSS]*2 and count_match >= q[qs.QRY_LRSS]/2 and\
               total_match <= q['sftm_upper']*2 and total_match >= q['sftm_lower']/2:
                working_queries += 1
            else:
                non_working_queries.append(msg)
        fail_msg = ''
        for msg in non_working_queries[:3]:
            fail_msg += msg
        self.assertGreaterEqual(working_queries, 10, fail_msg)
Пример #11
0
    def testGenerateQuery(self):
        """
        Tests or query generator against a 'db' to make sure it is
        generating the right queries.

        Builds 1000 rows, expands each generated query into all ordered
        2-clause permutations of its six sub-queries, and checks both
        the OR match count and the summed per-term match total against
        their requested bounds within a factor of two.
        """
        #generate a 'db' to test against
        rows = []
        for _ in xrange(1000):
            row_dict = {}
            for var in self.fields_to_gen:
                dist = self.dist_holder.dist_dict[var]
                v = dist.generate(row_dict)
                row_dict[var] = sv.VAR_CONVERTERS[var].to_agg_fmt(v)
            rows.append(row_dict)
        #generate queries
        query_batches = self.generator.produce_query_batches()
        query_value_sets = []
        for query_batch in query_batches:
            queries = query_batch.produce_queries()
            for query in queries:
                for (a, b) in itertools.permutations(range(6), 2):
                    query_value_sets.append({
                        'first_clause':
                        query['sub_queries'][0][a][qs.QRY_VALUE],
                        'first_clause_field':
                        query['sub_queries'][0][a][qs.QRY_FIELD],
                        'second_clause':
                        query['sub_queries'][0][b][qs.QRY_VALUE],
                        'second_clause_field':
                        query['sub_queries'][0][b][qs.QRY_FIELD],
                        'r_lower':
                        query['r_lower'],
                        'r_upper':
                        query['r_upper'],
                        'stm_lower':
                        query['ftm_lower'],
                        'stm_upper':
                        query['ftm_upper']
                    })
        #check to see right number of queries generated
        self.assertEqual(len(query_value_sets), 600, self.seed_msg)

        #check queries against 'db' to make sure they match within a factor
        #of two
        # (the unused 'distribution_ineligible' counter was removed)
        working_queries = 0
        non_working_queries = []
        for q in query_value_sets:
            first_field = sv.sql_name_to_enum(q['first_clause_field'])
            second_field = sv.sql_name_to_enum(q['second_clause_field'])
            ftm_match = len(
                [x for x in rows if x[first_field] == q['first_clause']])
            stm_match = len(
                [x for x in rows if x[second_field] == q['second_clause']])
            total_match = ftm_match + stm_match
            count_match = len([
                x for x in rows if x[first_field] == q['first_clause']
                or x[second_field] == q['second_clause']
            ])
            # BUG FIX: both bound labels used to read 'ftm_lower'; they
            # now name the stm_* values they actually print.
            msg = 'Query was\n'\
                  'where: %s=%s AND %s=%s\n'\
                  'stm_lower: %d\n'\
                  'stm_upper: %d\n'\
                  'r_lower: %d\n'\
                  'r_upper: %d\n'\
                  'ftm_match: %d\n'\
                  'count_match: %d\n'\
                  '\n' % (q['first_clause_field'], q['first_clause'],
                          q['second_clause_field'], q['second_clause'],
                          q['stm_lower'], q['stm_upper'], q['r_lower'],
                          q['r_upper'], ftm_match, count_match)
            # NOTE(review): q is a plain dict keyed by 'r_lower' /
            # 'r_upper'; the lookups below assume qs.QRY_URSS ==
            # 'r_upper' and qs.QRY_LRSS == 'r_lower' -- confirm.
            if count_match <= q[qs.QRY_URSS]*2 and count_match >= q[qs.QRY_LRSS]/2 and\
               total_match <= q['stm_upper']*2 and total_match >= q['stm_lower']/2:
                working_queries += 1
            else:
                non_working_queries.append(msg)
        fail_msg = ''
        for msg in non_working_queries[:3]:
            fail_msg += msg
        self.assertGreaterEqual(working_queries, 10, fail_msg)
Пример #12
0
 def testGenerateQueryRanges(self):
     """
     Tests and ta2 query generator against a 'db' to make sure it is
     generating the right queries; does not include a fishing term.

     Builds 1000 rows, expands each generated query into (equality,
     range) pairs of its sub-queries, and checks each pair's match
     count against its requested bounds within a factor of two.
     """
     #generate a 'db' to test against
     rows = []
     for _ in xrange(1000):
         row_dict = {}
         for var in self.fields_to_gen:
             dist = self.dist_holder.dist_dict[var]
             v = dist.generate(row_dict)
             row_dict[var] = v
         rows.append(row_dict)
     #generate queries
     query_batches = self.generator3.produce_query_batches()
     query_value_sets = []
     for query_batch in query_batches:
         queries = query_batch.produce_queries()
         for query in queries:
             query['sub_queries'] = list(
                 itertools.chain.from_iterable(query['sub_queries']))
             for (a, b) in itertools.permutations(range(0, 6), 2):
                 # BUG FIX: this was a bare 'except:', which also
                 # swallowed KeyboardInterrupt/SystemExit; only a
                 # missing key or a too-short sub-query list should
                 # skip the pair.
                 try:
                     value = query['sub_queries'][a][qs.QRY_VALUE]
                     lower = query['sub_queries'][b][qs.QRY_LBOUND]
                     upper = query['sub_queries'][b][qs.QRY_UBOUND]
                 except (KeyError, IndexError):
                     continue
                 query_value_sets.append({
                     'first_clause': value,
                     'first_clause_field': query['sub_queries'][a][qs.QRY_FIELD],
                     'second_clause_lower': lower,
                     'second_clause_upper': upper,
                     'second_clause_field': query['sub_queries'][b][qs.QRY_FIELD],
                     'r_lower': query['r_lower'],
                     'r_upper': query['r_upper'],
                     'range_type': query[qs.QRY_SUBCAT]})
     #check to see right number of queries generated
     self.assertEqual(len(query_value_sets), 90, self.seed_msg)

     #check queries against 'db' to make sure they match within a factor
     #of two
     working_queries = 0
     non_working_queries = []
     for q in query_value_sets:
         one_var = sv.sql_name_to_enum(q['first_clause_field'])
         two_var = sv.sql_name_to_enum(q['second_clause_field'])
         # NOTE(review): the bounds are compared as upper-cased
         # strings, so the BETWEEN is lexicographic -- confirm that is
         # intended.  The .upper() calls are hoisted out of the
         # per-row comparison (loop-invariant).
         first_value = q['first_clause'].upper()
         range_lo = q['second_clause_lower'].upper()
         range_hi = q['second_clause_upper'].upper()
         count_match = len(
             [x for x in rows
              if sv.VAR_CONVERTERS[one_var].to_agg_fmt(x[one_var]) ==
                 first_value
              and range_lo <=
                  sv.VAR_CONVERTERS[two_var].to_agg_fmt(x[two_var]) <=
                  range_hi])
         msg = 'Query was\n'\
               'where: %s=%s AND %s BETWEEN %s AND %s\n'\
               'r_lower: %d\n'\
               'r_upper: %d\n'\
               'count_match: %d\n'\
               '\n' % (q['first_clause_field'], q['first_clause'],
                       q['second_clause_field'], q['second_clause_lower'],
                       q['second_clause_upper'], q['r_lower'],
                       q['r_upper'], count_match)
         # NOTE(review): q is a plain dict keyed by 'r_lower'/'r_upper';
         # assumes qs.QRY_URSS == 'r_upper', qs.QRY_LRSS == 'r_lower'.
         if count_match <= q[qs.QRY_URSS]*2 and count_match >= q[qs.QRY_LRSS]/2:
             working_queries += 1
         else:
             non_working_queries.append(msg)
     fail_msg = ''
     for msg in non_working_queries[:3]:
         fail_msg += msg
     self.assertGreaterEqual(working_queries, 10, fail_msg)
Пример #13
0
    def _generate_full_queries(self, dist, q):
        '''
        Generates queries of the form ./node1/node2/LEAF

        'dist' is the xml distribution and 'q' is the query-schema row
        giving the number of queries and their result-set-size bounds.
        Returns an aqb.XmlQueryBatch of the candidate query dicts.
        '''
        query_dicts = []
        # BUG FIX: initialize 'count' so q['no_queries'] == 0 no longer
        # raises NameError at the return below; an empty batch is
        # returned instead (-1 keeps the "last loop index" semantics).
        count = -1
        for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
            self.__count += 1
            LOGGER.info('P11: Created %d out of %d queries' % \
                        (self.__count, self.__total))
            # per-branch densities derived from the whole-record bounds
            # NOTE(review): on Python 2 these divisions truncate if
            # both operands are ints -- confirm QRY_LRSS/db_size are
            # floats.
            r_lower_total = q[qs.QRY_LRSS] / self.__db_size
            r_upper_total = q[qs.QRY_URSS] / self.__db_size
            branch_r_lower = pow(r_lower_total / xg.XML_DEPTH,
                                 1.0 / (xg.XML_DEPTH))
            branch_r_upper = pow(r_upper_total / xg.XML_DEPTH,
                                 1.0 / (xg.XML_DEPTH))

            # build the /xml/node1/... tag path one level at a time
            tags = []
            for level in xrange(xg.XML_DEPTH - 1):
                tags.append(
                    dist.generate_node_pdf(level, branch_r_lower,
                                           branch_r_upper))
            tag_string = ''
            for tag in tags:
                tag_string += "/%s" % (tag)
            (field,
             value) = self._create_equality_leaf(dist, branch_r_lower,
                                                 branch_r_upper)

            value = sv.VAR_CONVERTERS[sv.sql_name_to_enum(field)].to_csv(value)
            # escape single quotes for SQL; non-string values raise and
            # are left untouched
            try:
                value = value.replace('\'', '\'\'')
            except (TypeError, AttributeError):
                pass
            # numeric-ish fields are unquoted in the where clause
            if field in ['foo', 'age', 'income']:
                where = "xml_value(xml,\'/xml%s/%s\',%s)" % (tag_string, field,
                                                             value)
            else:
                where = "xml_value(xml,\'/xml%s/%s\',\'%s\')" % (tag_string,
                                                                 field, value)

            xpath = ['xml'] + tags
            xpath.append(field)
            qid = qids.query_id()
            # skip where-clauses that have already been generated
            if qid != qids.full_where_has_been_seen(qid, where):
                continue
            query_dicts.append({
                qs.QRY_ENUM: qs.CAT.P11_FULL,
                qs.QRY_QID: qid,
                qs.QRY_DBNUMRECORDS: self.__db_size,
                qs.QRY_DBRECORDSIZE: self.__row_width,
                qs.QRY_CAT: self.__cat,
                qs.QRY_SUBCAT: 'eq-full',
                qs.QRY_PERF: self.__perf,
                qs.QRY_WHERECLAUSE: where,
                qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
                qs.QRY_NEGATE: False,
                qs.QRY_FIELDTYPE: 'string',
                qs.QRY_LRSS: q[qs.QRY_LRSS],
                qs.QRY_URSS: q[qs.QRY_URSS],
                qs.QRY_VALUE: value,
                qs.QRY_XPATH: xpath
            })
        return aqb.XmlQueryBatch(
            query_dicts, count, max(int((count + 1) / OVER_GENERATION_RATIO),
                                    1), True)
Пример #14
0
 def setUp(self):
     """
     Build P6/P7 wildcard query generators over two hand-weighted name
     distributions and a small text distribution.

     The random seed is taken from the wall clock and recorded in
     self.seed_msg so a failing run can be reproduced.
     """
     self.seed = int(time.time())
     self.seed_msg = "Random seed used for this test: %s" % self.seed
     self.longMessage = True
     spar_random.seed(self.seed)
     #set up initialization values
     sub_cat = 'eq'
     # first-name distribution; relative weights 1 vs 9 mix rare and
     # common values across word lengths 5..11
     self._dist1 = base_distribution.CompactIndependentDistribution()
     self._dist1.add('Letus', 1)
     self._dist1.add('arbey', 9)
     self._dist1.add('Amelia', 1)
     self._dist1.add('Anfrew', 9)
     self._dist1.add('Roberts', 1)
     self._dist1.add('Andreas', 9)
     self._dist1.add('Vacation', 1)
     self._dist1.add('Occulary', 9)
     self._dist1.add('Fuzzballs', 1)
     self._dist1.add('Divasmuch', 9)
     self._dist1.add('tenletters', 1)
     self._dist1.add('arehardtoo', 9)
     self._dist1.add('elevenseven', 1)
     self._dist1.add('harderthant', 9)
     # last-name distribution, weighted the same way
     self._dist2 = base_distribution.CompactIndependentDistribution()
     self._dist2.add('Smith', 1)
     self._dist2.add('Henry', 9)
     self._dist2.add('Roberts', 1)
     self._dist2.add('Andreas', 9)
     self._dist2.add('Vacation', 1)
     self._dist2.add('Occulary', 9)
     self._dist2.add('Fuzzballs', 1)
     self._dist2.add('Divasmuch', 9)
     self._dist2.add('tenletters', 1)
     self._dist2.add('arehardtoo', 9)
     self._dist2.add('elevenseven', 1)
     self._dist2.add('harderthant', 9)
     # small corpus backing the notes-field text generator
     f = s.StringIO('''A a b my spout, when you tip me over hear me a a b.
             C cc e I'm a little teacup short and stout, here is my c cc e. 
     F fff h something about something about something of something f fff h.
     I jjjj l sentence generation is hard and annoying when doing i jjjj l. 
     M nnnnn p lets try to include more original sentenctes what m nnnnn p.
     Q rrrrrr s interesting trial of things and other things that q rrrrrr s. 
     U vvvvvvv y and need to include some other stuff to make u vvvvvvv y.'''
                    )
     self._dist3 = text_generator.TextGenerator((f, ))
     fields_non_notes = [
         sv.sql_name_to_enum('fname'),
         sv.sql_name_to_enum('lname')
     ]
     fields_notes = [sv.sql_name_to_enum('notes1')]
     dists = [self._dist1, self._dist2]
     # per-query-schema columns: count, RSS bounds, keyword length, type
     other_fields = [
         'no_queries', 'r_lower', 'r_upper', 'keyword_len', 'type'
     ]
     other_cols_P6 = [[2, 1, 100, 5, 'initial-one'],
                      [2, 1, 100, 9, 'middle-one'],
                      [2, 1, 100, 5, 'final-one']]
     other_cols_P7 = [[2, 1, 200, 5, 'initial'], [2, 1, 250, 5, 'both'],
                      [2, 1, 200, 5, 'final']]
     self.P6_non_notes_generator = wqg.WildcardQueryGenerator(
         'P6', '', ["LL"], dists, fields_non_notes, 1000, 100, other_fields,
         other_cols_P6)
     self.P7_non_notes_generator = wqg.WildcardQueryGenerator(
         'P7', '', ["LL"], dists, fields_non_notes, 1000, 100, other_fields,
         other_cols_P7)
     self.P7_notes_generator = wqg.WildcardQueryGenerator(
         'P7', '', ["LL"], [self._dist3], fields_notes, 1000, 100,
         other_fields, other_cols_P7)