Example #1
    def _equality_queries(self, field, dist, pdf_lower, pdf_upper):
        """
        This generates and returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the
        logic for processing the results and printing the query
        """

        query_dicts = []
        for x in xrange(CLAUSE_OVER_GENERATION_RATIO):
            value = self.__dists[field].generate_pdf(pdf_lower, pdf_upper, {})
            qid = qids.query_id()
            (value, where) = aqb.EqualityFishingQueryBatch.format_value_and_where(field, value)                       
            query_dicts.append({qs.QRY_ENUM : qs.CAT.EQ, 
                                qs.QRY_QID : qids.full_where_has_been_seen(qid, where),
                                qs.QRY_DBNUMRECORDS : self.__db_size,
                                qs.QRY_DBRECORDSIZE : self.__row_width, 
                                qs.QRY_PERF : self.__perf,
                                qs.QRY_CAT : 'EQ',
                                qs.QRY_SUBCAT : '', 
                                qs.QRY_WHERECLAUSE : where,
                                qs.QRY_FIELD : sv.sql_info[field][0],
                                qs.QRY_NEGATE : False,
                                qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                                qs.QRY_VALUE : value})
    
        return aqb.EqualityQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                      1, False)  
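Several of these examples lean on qids.full_where_has_been_seen to keep duplicate WHERE clauses on a single query id: Example #1 stores its return value, and later examples skip a query when the returned id differs from the freshly minted one. Below is a minimal stand-in for that helper, assuming only the behaviour visible in these snippets; the real implementation lives in the qids module and may differ.

# Illustrative stand-in for qids.full_where_has_been_seen, purely to show the
# de-duplication idea used throughout these examples.
_seen_wheres = {}

def full_where_has_been_seen(qid, where):
    # Return the qid already recorded for this WHERE clause, or record and
    # return the new one.
    return _seen_wheres.setdefault(where, qid)

print(full_where_has_been_seen(1, "fname = ''JOHN''"))   # 1
print(full_where_has_been_seen(2, "fname = ''JOHN''"))   # 1 (duplicate clause)
print(full_where_has_been_seen(3, "lname = ''SMITH''"))  # 3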
Example #2
    def _string_equality_queries(self, field, dist, pdf_lower, pdf_upper):
        """
        This generates and returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the
        logic for processing the results and printing the query
        """

        query_dicts = []
        for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
            value = self.__dists[field].generate_pdf(pdf_lower, pdf_upper, {})
            qid = qids.query_id()
            where = 'SUBSTR(%s,1,9) = \'\'%s\'\'' % (sv.sql_info[field][0], value[:9])
            query_dicts.append({qs.QRY_ENUM : qs.CAT.P7_FINAL, 
                                qs.QRY_QID : qids.full_where_has_been_seen(qid, where),
                                qs.QRY_DBNUMRECORDS : self.__db_size,
                                qs.QRY_DBRECORDSIZE : self.__row_width, 
                                qs.QRY_PERF : self.__perf,
                                qs.QRY_CAT : 'P7',
                                qs.QRY_SUBCAT : 'final', 
                                qs.QRY_WHERECLAUSE : where,
                                qs.QRY_FIELD : sv.sql_info[field][0],
                                qs.QRY_NEGATE : False,
                                qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                                qs.QRY_KEYWORDLEN : 9,
                                qs.QRY_SEARCHFOR : value[:9],
                                qs.QRY_SEARCHDELIMNUM : 1 })
    
        return aqb.WildcardQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                      1, False)     
Example #3
 def compound_eq(self, q):
     
     sub_bobs = []
     pdf_lower_total = q[qs.QRY_LRSS]/self.__db_size
     pdf_upper_total = q[qs.QRY_URSS]/self.__db_size
     pdf_lower = pow(pdf_lower_total,1.0/len(self.__dists))
     pdf_upper = pow(pdf_upper_total,1.0/len(self.__dists))
     for (f, dist) in self.__dists.iteritems(): 
         if f in [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]:
             sub_bobs.append(self._string_equality_queries(f, dist, pdf_lower, pdf_upper))
         else:
             sub_bobs.append(self._equality_queries(f, dist, pdf_lower, pdf_upper))
         
     return {qs.QRY_ENUM : qs.CAT.P1_EQ_AND,
             qs.QRY_QID : qids.query_id(),
             qs.QRY_DBNUMRECORDS : self.__db_size,
             qs.QRY_DBRECORDSIZE : self.__row_width, 
             qs.QRY_PERF : self.__perf,
             qs.QRY_CAT : self.__cat,
             qs.QRY_SUBCAT : 'eq-and' if len(self.__dists) > 1 else '', 
             qs.QRY_WHERECLAUSE : '',
             qs.QRY_NEGATE : False,
             qs.QRY_LRSS : q['r_lower'],
             qs.QRY_URSS : q['r_upper'],
             qs.QRY_NUMCLAUSES :  len(self.__dists), 
             qs.QRY_NUMTERMSPERCLAUSE : 1, 
             qs.QRY_FTMLOWER : 1,
             qs.QRY_FTMUPPER : self.__db_size, 
             qs.QRY_SUBBOBS : sub_bobs}
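compound_eq spreads the requested overall selectivity evenly across the conjoined clauses by taking the len(self.__dists)-th root of the total. A worked sketch with invented numbers (the real inputs come from q[qs.QRY_LRSS], q[qs.QRY_URSS], and the database size):

# Hypothetical numbers only; illustrates the per-clause selectivity split
# used by compound_eq above.
db_size = 100000.0
r_lower, r_upper = 10.0, 100.0      # desired overall result-set-size bounds
num_clauses = 2                     # stand-in for len(self.__dists)

pdf_lower_total = r_lower / db_size                   # 0.0001
pdf_upper_total = r_upper / db_size                   # 0.001
pdf_lower = pow(pdf_lower_total, 1.0 / num_clauses)   # 0.01
pdf_upper = pow(pdf_upper_total, 1.0 / num_clauses)   # ~0.0316

# Two equality clauses drawn at roughly 1-3% selectivity each should AND
# together to land near the requested 10-100 matching records, assuming the
# fields are independent.
print(pdf_lower, pdf_upper)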
Example #4
def process_atomic_wheres(atomic_where_clauses, resultdb, record_size, num_records, qid):
        for atomic_where_clause in atomic_where_clauses:
            atomic_where_clause = atomic_where_clause.strip()
            cmd = "select aqid from atomic_queries where where_clause = \"" + atomic_where_clause + "\""
            resultdb._execute(cmd)
            aqid = resultdb._fetchall()
            if not aqid:
                afield = atomic_where_clause.split()[0].strip()
                afield_type = FIELD_TO_TYPE[afield]
                atomic_where_aqid = qids.query_id()
                cmd = "insert into atomic_queries (where_clause, db_record_size,"+\
                " db_num_records, category, field_type, field, num_matching_records, aqid) values "+\
                "(\""+str(atomic_where_clause)+"\", "+str(record_size)+", "+str(num_records)+", \"EQ\",\""+str(afield_type)+"\",\""+str(afield)+\
                "\", 1, "+str(atomic_where_aqid)+")"
                print cmd
                resultdb._execute(cmd)
                atomic_ids = resultdb._fetchall()
            else:
                atomic_where_aqid = aqid[0][0]
            #rebuild mapping between full and atomic queries
            cmd = "insert into full_to_atomic_junction (full_row_id, atomic_row_id) values ("+\
            str(qid)+","+str(atomic_where_aqid)+")"
            print cmd
            resultdb._execute(cmd)
            null = resultdb._fetchall()        
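For readability, here is the INSERT statement the branch above would emit for one hypothetical atomic clause; the clause text, sizes, field type, and aqid below are invented stand-ins for the runtime values, and FIELD_TO_TYPE is replaced by a literal.

# Sketch only: reproduce the SQL text Example #4 builds, with made-up inputs.
atomic_where_clause = "fname = ''JOHN''"
record_size, num_records, atomic_where_aqid = 100, 1000, 42
afield = atomic_where_clause.split()[0].strip()
afield_type = 'string'          # FIELD_TO_TYPE[afield] in the original code

cmd = ('insert into atomic_queries (where_clause, db_record_size, db_num_records,'
       ' category, field_type, field, num_matching_records, aqid) values '
       '("%s", %d, %d, "EQ","%s","%s", 1, %d)'
       % (atomic_where_clause, record_size, num_records, afield_type, afield,
          atomic_where_aqid))
print(cmd)
# insert into atomic_queries (...) values ("fname = ''JOHN''", 100, 1000,
# "EQ","string","fname", 1, 42)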
Example #5
    def _generate_queries(self):
        """
        This generates and returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the
        logic for processing the results and printing the query
        """
        query_dicts = []
        for (field, dist) in self.__dists.iteritems():
            for q_template in self.__queries:
                query_dicts = []
                for count in xrange(q_template['no_queries']*\
                                    OVER_GENERATION_RATIO):
                    self.__count += 1
                    LOGGER.info('P9: Created %d out of %d queries' % \
                                (self.__count, self.__total))
                    lrss = (q_template[qs.QRY_LRSS]/self.__db_size)
                    urss = (q_template[qs.QRY_URSS]/self.__db_size)
                    (word_one, word_two) = dist.generate_alarmword(lrss, urss)
                    where = 'WORD_PROXIMITY(%s, \'\'%s\'\', \'\'%s\'\')' % \
                                        (sv.sql_info[field][0],
                                         word_one, word_two)
                    where_clause = "%s <= %d ORDER BY %s" % (where, q_template['distance'], where)
                    qid = qids.query_id()
                    if qid != qids.full_where_has_been_seen(qid,where_clause):
                        continue
                    query_dicts.append({ 
                           qs.QRY_ENUM : qs.CAT.P9_ALARM_WORDS,
                           qs.QRY_QID : qid,
                           qs.QRY_CAT : self.__cat, 
                           qs.QRY_SUBCAT : 'alarm-words', 
                           qs.QRY_DBNUMRECORDS : self.__db_size,
                           qs.QRY_DBRECORDSIZE : self.__row_width,
                           qs.QRY_PERF : self.__perf,
                           qs.QRY_FIELD: sv.sql_info[field][0],
                           qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                           qs.QRY_WHERECLAUSE : where_clause,
                           qs.QRY_LRSS : q_template[qs.QRY_LRSS],
                           qs.QRY_URSS : q_template[qs.QRY_URSS],
                           qs.QRY_ALARMWORDONE : word_one,
                           qs.QRY_ALARMWORDTWO : word_two,
                           qs.QRY_ALARMWORDDISTANCE : q_template['distance']})

                self.__bobs.append(aqb.AlarmQueryBatch(query_dicts, count,
                                                       int(count/OVER_GENERATION_RATIO),
                                                       True))        
        return self.__bobs   
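The WHERE clause built above is easier to read with concrete values plugged in. The column name, word pair, and distance below are invented stand-ins for sv.sql_info[field][0], the generated alarm words, and q_template['distance']; the formatting matches the templates used in the loop.

# Hypothetical rendering of a P9 alarm-word clause.
field_sql = 'notes1'
word_one, word_two = 'alpha', 'bravo'
distance = 50

where = "WORD_PROXIMITY(%s, ''%s'', ''%s'')" % (field_sql, word_one, word_two)
where_clause = "%s <= %d ORDER BY %s" % (where, distance, where)
print(where_clause)
# WORD_PROXIMITY(notes1, ''alpha'', ''bravo'') <= 50 ORDER BY WORD_PROXIMITY(notes1, ''alpha'', ''bravo'')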
Example #6
    def _generate_range_queries(self, field, dist, q):
        """
        This generates and returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the
        logic for processing the results and printing the query
        """

        query_dicts = []
        num_queries = 0
        for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
            num_queries = count
            self.__count += 1
            LOGGER.info('P2: Created %d out of %d queries' %
                        (self.__count, self.__total))
            r_lower_cdf = q['rss_lower_cdf']
            r_upper_cdf = q['rss_upper_cdf']
            (lower, upper) = dist.generate_double_range(r_lower_cdf,
                                                        r_upper_cdf,
                                                        db_size=self.__db_size)
            qid = qids.query_id()
            if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
                where_clause = '%s BETWEEN %s AND %s' % \
                                    (sv.sql_info[field][0],
                                    sv.VAR_CONVERTERS[field].to_csv(lower),
                                    sv.VAR_CONVERTERS[field].to_csv(upper))
            else:
                where_clause = '%s BETWEEN \'\'%s\'\' AND \'\'%s\'\'' % \
                                    (sv.sql_info[field][0],
                                    sv.VAR_CONVERTERS[field].to_csv(lower),
                                    sv.VAR_CONVERTERS[field].to_csv(upper))
            if qid != qids.full_where_has_been_seen(qid, where_clause):
                continue
            query_dicts.append({
                qs.QRY_ENUM: qs.CAT.P2_RANGE,
                qs.QRY_QID: qid,
                qs.QRY_DBNUMRECORDS: self.__db_size,
                qs.QRY_DBRECORDSIZE: self.__row_width,
                qs.QRY_PERF: self.__perf,
                qs.QRY_CAT: self.__cat,
                qs.QRY_SUBCAT: 'range',
                qs.QRY_WHERECLAUSE: where_clause,
                qs.QRY_FIELD: sv.sql_info[field][0],
                qs.QRY_NEGATE: False,
                qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                qs.QRY_LRSS: q[qs.QRY_LRSS],
                qs.QRY_URSS: q[qs.QRY_URSS],
                qs.QRY_LBOUND: lower,
                qs.QRY_UBOUND: upper,
                qs.QRY_RANGE: 0
            })

        return aqb.RangeQueryBatch(
            query_dicts, num_queries,
            int((num_queries + 1) / OVER_GENERATION_RATIO), True)
Example #7
 def _generate_short_queries(self, dist, q):
     '''
     Generates queries of the form .//LEAF
     '''
     query_dicts = []
     query_count = 0
     for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
         self.__count += 1
         query_count = count
         LOGGER.info('P11: Created %d out of %d queries' % \
                     (self.__count, self.__total))
         r_lower = q[qs.QRY_LRSS] / (self.__db_size * xg.XML_DEPTH *
                                     xg.FAN_OUT)
         r_upper = q[qs.QRY_URSS] / (self.__db_size * xg.XML_DEPTH *
                                     xg.FAN_OUT)
         (field, value) = self._create_equality_leaf(dist, r_lower, r_upper)
         value = sv.VAR_CONVERTERS[sv.sql_name_to_enum(field)].to_csv(value)
         try:
             value = value.replace('\'', '\'\'')
         except TypeError:
             pass
         except AttributeError:
             pass
         if field in ['foo', 'age', 'income']:
             where = "xml_value(xml,\'//%s\', %s)" % (field, value)
         else:
             where = "xml_value(xml,\'//%s\', \'%s\')" % (field, value)
         xpath = field
         qid = qids.query_id()
         if qid != qids.full_where_has_been_seen(qid, where):
             continue
         query_dicts.append({
             qs.QRY_ENUM: qs.CAT.P11_SHORT,
             qs.QRY_QID: qid,
             qs.QRY_DBNUMRECORDS: self.__db_size,
             qs.QRY_DBRECORDSIZE: self.__row_width,
             qs.QRY_PERF: self.__perf,
             qs.QRY_CAT: self.__cat,
             qs.QRY_SUBCAT: 'eq-double-slash',
             qs.QRY_WHERECLAUSE: where,
             qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
             qs.QRY_NEGATE: False,
             qs.QRY_FIELDTYPE: 'string',
             qs.QRY_LRSS: q[qs.QRY_LRSS],
             qs.QRY_URSS: q[qs.QRY_URSS],
             qs.QRY_VALUE: value,
             qs.QRY_XPATH: xpath
         })
     return aqb.XmlQueryBatch(
         query_dicts, query_count,
         max(int((query_count + 1) / OVER_GENERATION_RATIO), 1), True)
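The per-leaf selectivity targets above are scaled down by db_size * xg.XML_DEPTH * xg.FAN_OUT, apparently to account for the many leaf positions a .//LEAF search can touch. A worked sketch with invented sizes (the depth and fan-out constants belong to the XML generator module):

# Hypothetical sizes; shows how the requested row counts become a per-leaf
# selectivity in Example #7.
db_size = 100000.0
xml_depth, fan_out = 4, 3               # stand-ins for xg.XML_DEPTH, xg.FAN_OUT
qry_lrss, qry_urss = 12.0, 120.0        # requested lower/upper result-set sizes

r_lower = qry_lrss / (db_size * xml_depth * fan_out)    # 1e-05
r_upper = qry_urss / (db_size * xml_depth * fan_out)    # 1e-04
print(r_lower, r_upper)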
Example #8
    def _than_queries(self, field, dist, pdf_lower, pdf_upper, range_type):
        query_dicts = []
        for x in xrange(CLAUSE_OVER_GENERATION_RATIO):
            #generate the range specific aspects of the queries
            if range_type == 'greater':
                value = dist.generate_greater_than(pdf_lower, pdf_upper,
                                                   db_size = self.__db_size)
                enum = qs.CAT.P2_GREATER
                tail = ''
                comp = '>='
            else:
                value = dist.generate_less_than(pdf_lower, pdf_upper,
                                                db_size = self.__db_size)
                enum = qs.CAT.P2_LESS
                tail = TAIL
                comp = '<='
            #generate the where clauses    
            if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
                where_clause = '%s %s %s' % (sv.sql_info[field][0], comp,
                                             sv.VAR_CONVERTERS[field].to_csv(value))
            elif field in [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]:
                
                where_clause = 'SUBSTR(%s,1,9) %s \'\'%s\'\'' % (sv.sql_info[field][0], comp, 
                                                 sv.VAR_CONVERTERS[field].to_csv(value)[:9])
                value = value + tail
            else:
                where_clause = '%s %s \'\'%s\'\'' % (sv.sql_info[field][0], comp,
                                             sv.VAR_CONVERTERS[field].to_csv(value))
            
            qid = qids.query_id()
            query_dicts.append({qs.QRY_ENUM : enum, 
                                qs.QRY_QID : qids.full_where_has_been_seen(qid, where_clause),
                                qs.QRY_DBNUMRECORDS : self.__db_size,
                                qs.QRY_DBRECORDSIZE : self.__row_width, 
                                qs.QRY_PERF : self.__perf,
                                qs.QRY_CAT : 'P2',
                                qs.QRY_SUBCAT : range_type,
                                qs.QRY_WHERECLAUSE : where_clause,
                                qs.QRY_FIELD : sv.sql_info[field][0],
                                qs.QRY_NEGATE : False,
                                qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                                qs.QRY_VALUE : value,
                                qs.QRY_RANGE : 0
                                })
        return aqb.RangeQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                      1, False)   
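In the name branch above, only the first nine characters are compared, and the value recorded for a '<=' query is padded with TAIL, presumably a high-sorting suffix like the literal run of 'Z's used locally in Example #13. A minimal sketch of the clause and the recorded bound, with an invented column and value:

# Sketch of the prefix comparison in the 'less' branch of _than_queries.
# TAIL here mirrors the literal from Example #13; the real module constant
# may differ.
TAIL = 'ZZZZZZZZZZZZZ'
comp = '<='
column, value = 'lname', 'ANDERSONVILLE'

where_clause = "SUBSTR(%s,1,9) %s ''%s''" % (column, comp, value[:9])
recorded_value = value + TAIL       # what gets stored under qs.QRY_VALUE
print(where_clause)                 # SUBSTR(lname,1,9) <= ''ANDERSONV''
print(recorded_value)
# Padding the stored value past the compared prefix keeps it a valid upper
# bound on every full string that shares the nine-character prefix.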
Example #9
    def _equality_queries(self, field, dist):
        """
        This generates and returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the
        logic for processing the results and printing the query
        """

        query_dicts = []
        for q in xrange(len(self.__queries)):
            query_dicts = []
            for count in xrange(self.__queries[q]['no_queries'] *
                                OVER_GENERATION_RATIO):
                self.__count += 1
                logger.info('EQ: Created %d out of %d queries' %
                            (self.__count, self.__total))
                r_lower_cdf = self.__queries[q]['rss_lower_cdf']
                r_upper_cdf = self.__queries[q]['rss_upper_cdf']
                value = self.__dists[field].generate_pdf(
                    r_lower_cdf, r_upper_cdf, {})
                qid = qids.query_id()
                (value,
                 where) = aqb.EqualityFishingQueryBatch.format_value_and_where(
                     field, value)
                if qid != qids.full_where_has_been_seen(qid, where):
                    continue
                query_dicts.append({
                    qs.QRY_ENUM: qs.CAT.EQ,
                    qs.QRY_QID: qid,
                    qs.QRY_DBNUMRECORDS: self.__db_size,
                    qs.QRY_DBRECORDSIZE: self.__row_width,
                    qs.QRY_PERF: self.__perf,
                    qs.QRY_CAT: self.__cat,
                    qs.QRY_SUBCAT: '',
                    qs.QRY_WHERECLAUSE: where,
                    qs.QRY_FIELD: sv.sql_info[field][0],
                    qs.QRY_NEGATE: False,
                    qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                    qs.QRY_LRSS: self.__queries[q][qs.QRY_LRSS],
                    qs.QRY_URSS: self.__queries[q][qs.QRY_URSS],
                    qs.QRY_VALUE: value
                })

            self.__bobs.append(
                aqb.EqualityQueryBatch(
                    query_dicts, count, int(
                        (count + 1) / OVER_GENERATION_RATIO), True))
Example #10
    def _equality_fishing_queries(self, f, dist):
        """
        Creates the logic for generating queries for fields with high entropy
        whose values cannot be generated beforehand
        """
        query_dicts = []
        for q in xrange(len(self.__queries)):
            query_dicts = []
            for count in xrange(self.__queries[q]['no_queries'] *
                                OVER_GENERATION_RATIO):
                self.__count += 1
                logger.info('EQ: Created %d out of %d queries' %
                            (self.__count, self.__total))
                field = f
                r_lower_cdf = self.__queries[q]['rss_lower_cdf']
                r_upper_cdf = self.__queries[q]['rss_upper_cdf']
                (search_value, lbound, ubound, enum) = \
                             self._get_fishing_values(f, dist, r_lower_cdf, r_upper_cdf)

                query_dicts.append({
                    qs.QRY_ENUM: enum,
                    qs.QRY_QID: qids.query_id(),
                    qs.QRY_DBNUMRECORDS: self.__db_size,
                    qs.QRY_DBRECORDSIZE: self.__row_width,
                    qs.QRY_PERF: self.__perf,
                    qs.QRY_CAT: self.__cat,
                    qs.QRY_SUBCAT: '',
                    qs.QRY_WHERECLAUSE: '',
                    qs.QRY_FIELD: sv.sql_info[field][0],
                    qs.QRY_NEGATE: False,
                    qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                    qs.QRY_LRSS: self.__queries[q][qs.QRY_LRSS],
                    qs.QRY_URSS: self.__queries[q][qs.QRY_URSS],
                    qs.QRY_VALUE: None,
                    qs.QRY_SEARCHFOR: search_value,
                    qs.QRY_LBOUND: lbound,
                    qs.QRY_UBOUND: ubound
                })

            self.__bobs.append(
                aqb.EqualityFishingQueryBatch(
                    query_dicts, count, int(
                        (count + 1) / OVER_GENERATION_RATIO), True))
Example #11
 def make_query_template(self, q, bob):
     return {
         qs.QRY_ENUM: self.enum(),
         qs.QRY_QID: qids.query_id(),
         qs.QRY_DBNUMRECORDS: self._db_size,
         qs.QRY_DBRECORDSIZE: self._row_width,
         qs.QRY_PERF: self._perf,
         qs.QRY_CAT: self._cat,
         qs.QRY_SUBCAT: 'eq',
         qs.QRY_WHERECLAUSE: '',
         qs.QRY_NEGATE: False,
         qs.QRY_LRSS: q['r_lower'],
         qs.QRY_URSS: q['r_upper'],
         qs.QRY_FTMLOWER: q['tm_lower'],
         qs.QRY_FTMUPPER: q['tm_upper'],
         qs.QRY_N: q['n'],
         qs.QRY_M: q['m'],
         qs.QRY_SUBBOBS: bob
     }
Example #12
 def make_equality_queries(self, field, value):
     '''
     creates equality query dictionaries
     '''
     (value, where) = aqb.EqualityFishingQueryBatch.format_value_and_where(
         field, value)
     qid = qids.query_id()
     return {
         qs.QRY_ENUM: qs.CAT.EQ,
         qs.QRY_QID: qids.full_where_has_been_seen(qid, where),
         qs.QRY_DBNUMRECORDS: self._db_size,
         qs.QRY_DBRECORDSIZE: self._row_width,
         qs.QRY_CAT: 'EQ',
         qs.QRY_SUBCAT: '',
         qs.QRY_WHERECLAUSE: where,
         qs.QRY_FIELD: sv.sql_info[field][0],
         qs.QRY_NEGATE: False,
         qs.QRY_FIELDTYPE: sv.sql_info[field][1],
         qs.QRY_VALUE: value
     }
Example #13
 def _range_queries(self, field, dist, pdf_lower, pdf_upper):
     query_dicts = []
     tail = 'ZZZZZZZZZZZZZ'
     for x in xrange(CLAUSE_OVER_GENERATION_RATIO):
         (lower, upper) = dist.generate_double_range(pdf_lower, pdf_upper,
                                                         db_size = self.__db_size)
         qid = qids.query_id()
         if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
             where_clause = '%s BETWEEN %s AND %s' % (sv.sql_info[field][0], 
                                                  sv.VAR_CONVERTERS[field].to_csv(lower),
                                                  sv.VAR_CONVERTERS[field].to_csv(upper))
         elif field in [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]:
             where_clause = 'SUBSTR(%s,1,9) BETWEEN \'\'%s\'\' AND \'\'%s\'\'' % (sv.sql_info[field][0], 
                                                  sv.VAR_CONVERTERS[field].to_csv(lower)[:9],
                                                  sv.VAR_CONVERTERS[field].to_csv(upper)[:9])
             lower = sv.VAR_CONVERTERS[field].to_csv(lower)[:9]
             upper = sv.VAR_CONVERTERS[field].to_csv(upper)[:9]+TAIL
         else:
             where_clause = '%s BETWEEN \'\'%s\'\' AND \'\'%s\'\'' % (sv.sql_info[field][0], 
                                                  sv.VAR_CONVERTERS[field].to_csv(lower),
                                                  sv.VAR_CONVERTERS[field].to_csv(upper))
         query_dicts.append({qs.QRY_ENUM : qs.CAT.P2_RANGE, 
                             qs.QRY_QID : qids.full_where_has_been_seen(qid, where_clause),
                             qs.QRY_DBNUMRECORDS : self.__db_size,
                             qs.QRY_DBRECORDSIZE : self.__row_width, 
                             qs.QRY_PERF : self.__perf,
                             qs.QRY_CAT : 'P2',
                             qs.QRY_SUBCAT : 'range', 
                             qs.QRY_WHERECLAUSE : where_clause,
                             qs.QRY_FIELD : sv.sql_info[field][0],
                             qs.QRY_NEGATE : False,
                             qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                             qs.QRY_LBOUND : lower,
                             qs.QRY_UBOUND : upper,
                             qs.QRY_RANGE : 0
                             })
 
     return aqb.RangeQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                   1, False)  
Example #14
    def process_results(self,
                        agg_results,
                        db_object,
                        query_file_handle,
                        refined_queries=None):
        """
        Takes in the aggregator results and, with those results, determines
        which queries in the batch are 'interesting'. It then instantiates
        query_results for those queries and uses them to write the results
        to the results database.

        The refined_queries argument is a list of already-refined queries,
        for use when the caller does not wish to rely on the pre-defined
        query-refinement function.
        """
        # Refine queries if they have not already been refined.
        if refined_queries is not None:
            self.refined_queries_results = refined_queries
            for (q, r) in self.refined_queries_results:
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    q, r, db_object)
                qr.QueryResultBase.write_to_full_table(q, r, db_object)
                q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
                self._print_query(q, query_file_handle)

                try:
                    q[qs.QRY_PERF].remove('IBM1')
                except ValueError:
                    pass

                q[qs.QRY_WHERECLAUSE] = q[qs.QRY_WHERECLAUSE] + " ORDER BY " +\
                                        q[qs.QRY_WHERECLAUSE] + " DESC"
                q[qs.QRY_ENUM] = qs.CAT.P9_EQ
                q[qs.QRY_CAT] = 'P9'
                q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    qids.query_id(), q[qs.QRY_WHERECLAUSE])
                r[qs.QRY_QID] = q[qs.QRY_QID]
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    q, r, db_object)
                qr.QueryResultBase.write_to_full_table(q, r, db_object)
                self._print_query(q, query_file_handle)
                q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
        else:
            refined_total = 0
            refined_queries = []
            for x in xrange(len(self.queries)):
                comp_q = self.queries[x]
                sub_results = agg_results[qs.QRY_SUBRESULTS]
                num_clauses = comp_q[qs.QRY_N]
                sub_bobs = comp_q[qs.QRY_SUBBOBS]
                clause_q_b = []
                #create the list of possible queries that can make up the clauses
                #(they are also paired with the bobs that create them)
                for b in sub_bobs:
                    clause_q = b.produce_queries()
                    clause_q_b += [(q, b) for q in clause_q]
                clause_r = []
                #create list of results that go with those queries

                for (q, _) in clause_q_b:
                    clause_r.append(sub_results[self.result_to_agg_map[q[
                        qs.QRY_WHERECLAUSE]]])
                comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
                #create a list of queries, their bobs, and their results
                clause_q_r = zip(clause_q_b, clause_r)
                clause_q_r = sorted(
                    clause_q_r,
                    key=lambda ((q, b), r): len(r[rdb.DBF_MATCHINGRECORDIDS]))
                    # Try all possible combinations of the queries to test whether
                    # any have the right clauses to match the required ftm and rss.
                seen_where_group = []
                comp_q_refined = False
                for clause_set in itertools.combinations(
                        clause_q_r, num_clauses):
                    if comp_q_refined:
                        continue
                    clause_list = list(clause_set)
                    values = [
                        q[qs.QRY_WHERECLAUSE] for ((q, _), _) in clause_list
                    ]
                    if len(values) != len(
                            set(values)) or values in seen_where_group:
                        continue
                    seen_where_group.append(values)

                    #check to see if it is working
                    #if stfm doesn't match, don't bother continuing
                    stfm = 0
                    for offset in xrange(comp_q[qs.QRY_N] - comp_q[qs.QRY_M] +
                                         1):
                        (_, r) = clause_list[offset]
                        stfm += len(r[rdb.DBF_MATCHINGRECORDIDS])
                    if not all([
                            stfm >= qbs.get_tm_rss_lower(comp_q[qs.QRY_ENUM]),
                            stfm <= qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                    ]):
                        continue
                    #if stfm does match, calculate the set intersection
                    matching_ids_set = set()
                    for m_set in itertools.combinations(
                            clause_list, comp_q[qs.QRY_M]):
                        matching_ids_set.update(
                            reduce(set.intersection, [
                                set(r[rdb.DBF_MATCHINGRECORDIDS])
                                for (_, r) in m_set
                            ]))
                    count = len(matching_ids_set)

                    #check overall compliance
                    if not all([
                            count >= qbs.get_rss_lower(comp_q[qs.QRY_ENUM]),
                            count <= qbs.get_rss_upper(comp_q[qs.QRY_ENUM])
                    ]):
                        continue

                    comp_q_refined = True
                    refined_total += 1
                    ##PROCESSING THE WORKING CLAUSE_LIST
                    working_clauses = clause_list
                    whereclauses = [
                        q[qs.QRY_WHERECLAUSE]
                        for ((q, _), _) in working_clauses
                    ]
                    where = ", ".join(whereclauses)
                    where = 'M_OF_N(%d, %d, %s)' % (comp_q[qs.QRY_M],
                                                    comp_q[qs.QRY_N], where)
                    #update query with chosen clauses
                    comp_q[qs.QRY_WHERECLAUSE] = where
                    comp_q['sub_queries'] = [
                        q for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_SUBBOBS] = [
                        b for ((_, b), _) in working_clauses
                    ]

                    # Build a list of counts of how many records match N terms,
                    # N-1 terms, ..., down to M terms, e.g. of the form 34 | 384 | 1094.
                    records_matching_count = dict(
                        zip(range(comp_q[qs.QRY_M], comp_q[qs.QRY_N] + 1),
                            [0] * comp_q[qs.QRY_N]))
                    for id in matching_ids_set:
                        matching_terms = [
                            1 if id in clause[1][rdb.DBF_MATCHINGRECORDIDS]
                            else 0 for clause in working_clauses
                        ]
                        term_matches = sum(matching_terms)
                        records_matching_count[term_matches] += 1
                    matching_records_counts = sorted(
                        records_matching_count.values(), reverse=True)
                    #update the results dictionary with the new calculated values
                    comp_q_results[qs.QRY_SUBRESULTS] = [
                        r for (_, r) in working_clauses
                    ]
                    comp_q_results[
                        rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                    comp_q_results[
                        qs.QRY_MATCHINGRECORDCOUNTS] = matching_records_counts

                    #make sure duplicate queries (and their atomic sub_components) have the same qids
                    comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                        comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                    comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                    for (sub_q,
                         sub_r) in zip(comp_q['sub_queries'],
                                       comp_q_results[qs.QRY_SUBRESULTS]):
                        sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                            sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                        sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]

                    #write queries to the results database
                    qr.QueryResultBase.write_to_full_to_atomic_table(
                        comp_q, comp_q_results, db_object)
                    qr.QueryResultBase.write_to_full_table(
                        comp_q, comp_q_results, db_object)
                    comp_q[qs.QRY_SUBBOBS][0].process_results(
                        None, db_object, query_file_handle,
                        zip(comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]))
                    self._print_query(comp_q, query_file_handle)

                    try:
                        comp_q[qs.QRY_PERF].remove('IBM1')
                    except ValueError:
                        pass

                    comp_q[qs.QRY_WHERECLAUSE] = comp_q[qs.QRY_WHERECLAUSE] + " ORDER BY " +\
                                                 comp_q[qs.QRY_WHERECLAUSE] + " DESC"
                    comp_q[qs.QRY_ENUM] = qs.CAT.P9_EQ
                    comp_q[qs.QRY_CAT] = 'P9'
                    comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                        qids.query_id(), comp_q[qs.QRY_WHERECLAUSE])
                    comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                    qr.QueryResultBase.write_to_full_to_atomic_table(
                        comp_q, comp_q_results, db_object)
                    qr.QueryResultBase.write_to_full_table(
                        comp_q, comp_q_results, db_object)
                    comp_q[qs.QRY_SUBBOBS][0].process_results(
                        None, db_object, query_file_handle,
                        zip(comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]))
                    self._print_query(comp_q, query_file_handle)
                    refined_queries.append((comp_q, comp_q_results))
                logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                            (x, len(self.queries), refined_total))
                if comp_q_refined:
                    logger.info(
                        "WORKING QUERY INFORMATION where_clause = %s, stfm = %d, rss = %d"
                        % (comp_q[qs.QRY_WHERECLAUSE], stfm, count))
        self.refined_queries_results = refined_queries
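The refinement step above realizes "at least M of N clauses match" as a union of M-way intersections over the per-clause match sets. Here is a self-contained sketch of just that calculation, with made-up record ids (functools.reduce is used so the snippet runs on both Python 2 and 3; the original relies on the Python 2 builtin reduce):

# Standalone illustration of the M_OF_N matching computation used when
# refining compound queries.
import itertools
from functools import reduce

clause_matches = [set([1, 2, 3, 4]),   # ids matched by clause 1
                  set([3, 4, 5]),      # clause 2
                  set([4, 5, 6])]      # clause 3
M, N = 2, 3

matching_ids_set = set()
for m_set in itertools.combinations(clause_matches, M):
    matching_ids_set.update(reduce(set.intersection, m_set))
print(sorted(matching_ids_set))   # [3, 4, 5]: ids satisfying at least 2 of 3 clauses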
Example #15
    def _generate_full_queries(self, dist, q):
        '''
        Generates queries of the form ./node1/node2/LEAF
        '''
        query_dicts = []
        for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
            self.__count += 1
            LOGGER.info('P11: Created %d out of %d queries' % \
                        (self.__count, self.__total))
            r_lower_total = q[qs.QRY_LRSS] / self.__db_size
            r_upper_total = q[qs.QRY_URSS] / self.__db_size
            branch_r_lower = pow(r_lower_total / xg.XML_DEPTH,
                                 1.0 / (xg.XML_DEPTH))
            branch_r_upper = pow(r_upper_total / xg.XML_DEPTH,
                                 1.0 / (xg.XML_DEPTH))

            tags = []
            for level in xrange(xg.XML_DEPTH - 1):
                tags.append(
                    dist.generate_node_pdf(level, branch_r_lower,
                                           branch_r_upper))
            tag_string = ''
            for tag in tags:
                tag_string += "/%s" % (tag)
            (field,
             value) = self._create_equality_leaf(dist, branch_r_lower,
                                                 branch_r_upper)

            value = sv.VAR_CONVERTERS[sv.sql_name_to_enum(field)].to_csv(value)
            try:
                value = value.replace('\'', '\'\'')
            except TypeError:
                pass
            except AttributeError:
                pass
            if field in ['foo', 'age', 'income']:
                where = "xml_value(xml,\'/xml%s/%s\',%s)" % (tag_string, field,
                                                             value)
            else:
                where = "xml_value(xml,\'/xml%s/%s\',\'%s\')" % (tag_string,
                                                                 field, value)

            xpath = ['xml'] + tags
            xpath.append(field)
            qid = qids.query_id()
            if qid != qids.full_where_has_been_seen(qid, where):
                continue
            query_dicts.append({
                qs.QRY_ENUM: qs.CAT.P11_FULL,
                qs.QRY_QID: qid,
                qs.QRY_DBNUMRECORDS: self.__db_size,
                qs.QRY_DBRECORDSIZE: self.__row_width,
                qs.QRY_CAT: self.__cat,
                qs.QRY_SUBCAT: 'eq-full',
                qs.QRY_PERF: self.__perf,
                qs.QRY_WHERECLAUSE: where,
                qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
                qs.QRY_NEGATE: False,
                qs.QRY_FIELDTYPE: 'string',
                qs.QRY_LRSS: q[qs.QRY_LRSS],
                qs.QRY_URSS: q[qs.QRY_URSS],
                qs.QRY_VALUE: value,
                qs.QRY_XPATH: xpath
            })
        return aqb.XmlQueryBatch(
            query_dicts, count, max(int((count + 1) / OVER_GENERATION_RATIO),
                                    1), True)
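The full-path generator above spreads the overall selectivity across every level of the path by taking the XML_DEPTH-th root of a per-level share. A worked sketch with hypothetical numbers (xg.XML_DEPTH stands in for the real constant):

# Hypothetical numbers; mirrors the branch selectivity computed in
# _generate_full_queries.
db_size, xml_depth = 100000.0, 4
qry_lrss, qry_urss = 10.0, 100.0

r_lower_total = qry_lrss / db_size                                  # 0.0001
r_upper_total = qry_urss / db_size                                  # 0.001
branch_r_lower = pow(r_lower_total / xml_depth, 1.0 / xml_depth)    # ~0.071
branch_r_upper = pow(r_upper_total / xml_depth, 1.0 / xml_depth)    # ~0.126
print(branch_r_lower, branch_r_upper)
# Each of the xml_depth node/leaf choices along the path is drawn at roughly
# this selectivity so the whole /xml/.../LEAF path lands near 10-100 rows.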
Example #16
    def produce_query_batches(self):
        """
        This generates and returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the
        logic for processing the results and printing the query
        """

        self.bobs = []
        query_dicts = []
        for f in xrange(len(self.__dists)):
            for q in xrange(len(self.__queries)):
                query_dicts = []
                for count in xrange(self.__queries[q]['no_queries'] *
                                    OVER_GENERATION_RATIO):
                    for r in xrange(self.__queries[q][qs.QRY_RANGEEXPL],
                                    self.__queries[q][qs.QRY_RANGEEXPU] + 1):
                        self.__count += 1
                        logger.debug('P2-foo: Created %d out of %d queries' %
                                     (self.__count, self.__total))
                        field = sv.sql_info[self.__fields[f]][0]
                        rss_lower = self.__queries[q][qs.QRY_LRSS]
                        rss_upper = self.__queries[q][qs.QRY_URSS]
                        rss_avg = (rss_lower + rss_upper) / 2
                        range_width = 2**r
                        if self.__queries[q][qs.QRY_TYPE] == 'range':
                            try:
                                (lower, upper) = self.__dists[self.__fields[f]].generate_two_sided(
                                    rss_avg, range_width, self.__db_size)
                            except bd.FooInputs:
                                (lower, upper) = (0, 0)
                            enum = qs.CAT.P2_RANGE_FOO
                            where_clause = '%s BETWEEN %d AND %d' % (
                                field, lower, upper)
                        else:
                            try:
                                lower = self.__dists[
                                    self.__fields[f]].generate_greater_than(
                                        rss_avg, self.__db_size)
                            except bd.FooInputs:
                                lower = 0
                            upper = 2**64 - 1
                            enum = qs.CAT.P2_GREATER_FOO
                            where_clause = '%s >= %d' % (field, lower)
                        qid = qids.query_id()
                        if qid != qids.full_where_has_been_seen(
                                qid, where_clause):
                            continue
                        query_dicts.append({
                            qs.QRY_ENUM: enum,
                            qs.QRY_QID: qid,
                            qs.QRY_CAT: self.__cat,
                            qs.QRY_SUBCAT: self.__queries[q][qs.QRY_TYPE],
                            qs.QRY_DBNUMRECORDS: self.__db_size,
                            qs.QRY_DBRECORDSIZE: self.__row_width,
                            qs.QRY_PERF: self.__perf,
                            qs.QRY_WHERECLAUSE: where_clause,
                            qs.QRY_FIELD: field,
                            qs.QRY_FIELDTYPE: sv.sql_info[self.__fields[f]][1],
                            qs.QRY_TYPE: self.__queries[q][qs.QRY_TYPE],
                            qs.QRY_LRSS: rss_lower,
                            qs.QRY_URSS: rss_upper,
                            qs.QRY_RANGEEXP: r,
                            qs.QRY_LBOUND: lower,
                            qs.QRY_UBOUND: upper,
                            qs.QRY_RANGE: upper - lower
                        })

                self.bobs.append(
                    aqb.FooRangeQueryBatch(
                        query_dicts, len(query_dicts),
                        int(len(query_dicts) / OVER_GENERATION_RATIO), True))
        return self.bobs
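A side note on the exponent sweep above: for each template the inner loop walks r from QRY_RANGEEXPL to QRY_RANGEEXPU, and the value 2**r handed to generate_two_sided appears to be a requested range width (an assumption; the distribution's API is not shown here), so each step doubles it:

# Stand-in bounds for QRY_RANGEEXPL / QRY_RANGEEXPU; prints the 2**r value
# passed to generate_two_sided for each exponent in the sweep.
range_exp_lower, range_exp_upper = 4, 8
for r in range(range_exp_lower, range_exp_upper + 1):
    print(r, 2 ** r)    # 16, 32, 64, 128, 256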
Example #17
    def produce_query_batches(self):
        """
        This generates and returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the
        logic for processing the results and printing the query
        """
        self.bobs = []
        query_dicts = []
        for f in xrange(len(self.__dists)):
            for q in xrange(len(self.__queries)):
                query_dicts = []
                for count in xrange(self.__queries[q]['no_queries'] *
                                    OVER_GENERATION_RATIO):
                    self.__count += 1
                    logger.info('P3/P4: Created %d out of %d queries' %
                                (self.__count, self.__total))
                    field = self.__fields[f]
                    r_u_pdf = self.__queries[q]['rss_u_pdf']
                    r_l_pdf = self.__queries[q]['rss_l_pdf']
                    kw_len = self.__queries[q][qs.QRY_KEYWORDLEN]
                    if self.__queries[q][qs.QRY_TYPE] == 'word':
                        enum = qs.CAT.P3
                        value = self.__dists[field].generate_word(
                            kw_len, r_l_pdf, r_u_pdf)
                        where_clause = 'CONTAINED_IN(%s, \'\'%s\'\')' % (
                            sv.sql_info[field][0], value.replace(
                                '\'', "\\'").lower())
                    else:
                        enum = qs.CAT.P4
                        (value, word) = self.__dists[field].generate_antistem(
                            kw_len, r_l_pdf, r_u_pdf)
                        where_clause = 'CONTAINS_STEM(%s, \'\'%s\'\')' % (
                            sv.sql_info[field][0], word.replace('\'',
                                                                "\\'").lower())
                    try:
                        RSS = self.__queries[q][qs.QRY_RSS]
                        LRSS = int(RSS / 1.1)
                        URSS = int(RSS * 1.1)
                    except KeyError:
                        RSS = (self.__queries[q][qs.QRY_LRSS] +
                               self.__queries[q][qs.QRY_URSS]) / 2.0
                        LRSS = self.__queries[q][qs.QRY_LRSS]
                        URSS = self.__queries[q][qs.QRY_URSS]

                    qid = qids.query_id()
                    if qid != qids.full_where_has_been_seen(qid, where_clause):
                        continue
                    query_dicts.append({
                        qs.QRY_ENUM: enum,
                        qs.QRY_QID: qid,
                        qs.QRY_CAT: self.__cat,
                        qs.QRY_SUBCAT: '',
                        qs.QRY_DBNUMRECORDS: self.__db_size,
                        qs.QRY_DBRECORDSIZE: self.__row_width,
                        qs.QRY_PERF: self.__perf,
                        qs.QRY_FIELD: sv.sql_info[field][0],
                        qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                        qs.QRY_WHERECLAUSE: where_clause,
                        qs.QRY_TYPE: self.__queries[q][qs.QRY_TYPE],
                        qs.QRY_RSS: RSS,
                        qs.QRY_LRSS: LRSS,
                        qs.QRY_URSS: URSS,
                        qs.QRY_KEYWORDLEN: kw_len,
                        qs.QRY_SEARCHFOR: value
                    })

                self.bobs.append(
                    aqb.KeywordQueryBatch(
                        query_dicts, count,
                        int((count + 1) / OVER_GENERATION_RATIO), True))
        return self.bobs
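As with the other generators, the P3/P4 WHERE clauses are easiest to read with concrete values. The column name and keywords below are invented stand-ins for sv.sql_info[field][0] and the generated word/stem; the templates and the quote escaping are the ones used in the loop above:

# Hypothetical rendering of the P3 (word) and P4 (stem) clauses.
column = 'notes2'
word, stem = "o'brien", 'Run'

p3_clause = "CONTAINED_IN(%s, ''%s'')" % (column, word.replace("'", "\\'").lower())
p4_clause = "CONTAINS_STEM(%s, ''%s'')" % (column, stem.replace("'", "\\'").lower())
print(p3_clause)   # CONTAINED_IN(notes2, ''o\'brien'')
print(p4_clause)   # CONTAINS_STEM(notes2, ''run'')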