def testRSSUpper(self):
    """The stored RSS upper bound keeps the maximum value set per key."""
    # Two updates per key: 'one' is lowered on its second update,
    # 'two' is raised on its second update.
    for key, bound in (('one', 10), ('two', 100), ('one', 1), ('two', 1000)):
        qbs.set_result_set_size_range_upper(key, bound)
    # Only the larger of the two values should survive for each key.
    self.assertEqual(10, qbs.get_rss_upper('one'))
    self.assertEqual(1000, qbs.get_rss_upper('two'))
def refine_queries(self, agg_result):
    """
    Takes in 'agg_result', the aggregator output for this BOQ, and
    selects which queries should be recorded in the results database.
    A query survives only when its match count lies inside the
    configured result-set-size bounds, its range value has not already
    been kept, and its result is marked valid.  Discarded queries and
    their results are simply dropped.
    """
    sub_results = agg_result[qs.QRY_SUBRESULTS]
    assert len(self.queries) == len(sub_results)
    kept = []
    seen_ranges = set()
    for query, result in zip(self.queries, sub_results):
        assert query
        assert result
        assert query[qs.QRY_QID] >= result[qs.QRY_QID]
        match_count = len(result[rdb.DBF_MATCHINGRECORDIDS])
        category = query[qs.QRY_ENUM]
        # Weed out incorrect counts, previously seen range values, and
        # invalid results.
        if match_count < qbs.get_rss_lower(category):
            continue
        if match_count > qbs.get_rss_upper(category):
            continue
        if query[qs.QRY_RANGEEXP] in seen_ranges:
            continue
        if not result[qs.QRY_VALID]:
            continue
        kept.append((query, result))
        seen_ranges.add(query[qs.QRY_RANGEEXP])
    #capping at choose-num number of queries
    self.refined_queries_results = kept
def refine_queries(self, agg_result):
    """
    Takes in 'agg_result' which is the result from the aggregator for
    this BOQ.  Selects which queries should be recorded in the results
    database.  For each query it gathers every fished value whose match
    count falls inside the configured result-set-size bounds, picks one
    candidate at random, applies that candidate's value, where-clause
    and matching ids to the query/result pair, and records the pair.
    """
    #selecting queries that match.
    queries = []
    assert len(self.queries) == len(agg_result[qs.QRY_SUBRESULTS])
    for q, r in zip(self.queries, agg_result[qs.QRY_SUBRESULTS]):
        assert q
        assert r
        assert q[qs.QRY_QID] >= r[qs.QRY_QID]
        # Candidate (value, where_clause, matching_ids) triples whose
        # counts satisfy the result-set-size bounds for this category.
        candidates = []
        for (value, value_result) in r[qs.QRY_FISHING_MATCHES_FOUND].iteritems():
            count = len(value_result)
            if qbs.get_rss_lower(q[qs.QRY_ENUM]) <= count and \
               qbs.get_rss_upper(q[qs.QRY_ENUM]) >= count:
                (value, where) = self.format_value_and_where(
                    sv.sql_name_to_enum(q[qs.QRY_FIELD]), value)
                candidates.append((value, where, value_result))
        if candidates:
            # BUGFIX: previously every appended tuple aliased the same
            # mutable q/r dicts, which were overwritten on each loop
            # iteration, so the random draw always carried the *last*
            # candidate's data.  Apply the sampled candidate's data
            # only after the draw so the choice is meaningful.
            (value, where, value_result) = random.sample(candidates, 1)[0]
            q[qs.QRY_VALUE] = value
            q[qs.QRY_WHERECLAUSE] = where
            r[rdb.DBF_MATCHINGRECORDIDS] = value_result
            # Reuse the qid of any previously seen identical where-clause.
            q[qs.QRY_QID] = qids.full_where_has_been_seen(
                q[qs.QRY_QID], q[qs.QRY_WHERECLAUSE])
            queries.append((q, r))
    #capping at choose-num number of queries
    self.refined_queries_results = queries
def __init__(self, query):
    """
    Cache the pieces of 'query' needed during matching: the qid, the
    field enum, the two alarmwords, their allowed distance, and the
    processing cutoff taken from the category's upper result-set size.
    """
    self._qid = query[qs.QRY_QID]
    self._field = sv.sql_name_to_enum(query[qs.QRY_FIELD])
    # Both alarmwords collected into a single set.
    self._alarmwords = {query[qs.QRY_ALARMWORDONE],
                        query[qs.QRY_ALARMWORDTWO]}
    self._alarmword_distance = query[qs.QRY_ALARMWORDDISTANCE]
    # Upper result-set-size bound for this query's category.
    self._process_cutoff = qbs.get_rss_upper(query[qs.QRY_ENUM])
    # Matches seen so far.
    self._count = 0
def __init__(self, query):
    '''
    Initialize the needed class variables from the query.

    Caches the qid and field enum, the processing cutoff derived from
    the category's upper result-set size (falling back to a large
    default when the category is unknown), and whether this query is
    top level.
    '''
    self._qid = query[qs.QRY_QID]
    self._field = sv.sql_name_to_enum(query[qs.QRY_FIELD])
    # try/except block is mostly for backwards compatability
    # with unit tests
    try:
        self._process_cutoff = qbs.get_rss_upper(query[qs.QRY_ENUM])
    except KeyError:
        self._process_cutoff = 100000
    # Matches seen so far.
    self._count = 0
    #If the query is atomic (i.e. top level), we want to apply a limit
    #on what it can collect, otherwise we want no process limit in
    #effect
    # dict.get replaces the former try/except KeyError on a plain
    # dictionary access.
    self._top_level = query.get('top_level', True)
def refine_queries(self, agg_result):
    '''
    Keep the queries whose match count fits the configured result set
    size bounds, then order each kept result's record ids so the rows
    where the words are closest together lead the id list.
    '''
    kept = []
    assert len(self.queries) == len(agg_result[qs.QRY_SUBRESULTS])
    for query, result in zip(self.queries, agg_result[qs.QRY_SUBRESULTS]):
        assert query
        assert result
        assert query[qs.QRY_QID] >= result[qs.QRY_QID]
        row_dist = result[qs.QRY_MATCHINGROWIDANDDISTANCES]
        count = len(row_dist)
        # Skip counts outside the bounds and invalid results.
        if count < qbs.get_rss_lower(query[qs.QRY_ENUM]) or \
           count > qbs.get_rss_upper(query[qs.QRY_ENUM]) or \
           not result[qs.QRY_VALID]:
            continue
        # Bucket row ids by their word distance.
        by_distance = {}
        for (row_id, dist) in row_dist:
            by_distance.setdefault(dist, []).append(row_id)
        # Walk the buckets from smallest to largest distance so the
        # closest matches come first.
        ordered_ids = []
        bucket_sizes = []
        for dist in sorted(by_distance):
            bucket = by_distance[dist]
            ordered_ids.extend(bucket)
            bucket_sizes.append(len(bucket))
        result[rdb.DBF_MATCHINGRECORDIDS] = ordered_ids
        result[qs.QRY_MATCHINGRECORDCOUNTS] = '|'.join(
            str(size) for size in sorted(bucket_sizes))
        kept.append((query, result))
    #capping at choose-num number of queries
    self.refined_queries_results = kept
def refine_queries(self, agg_result):
    """
    Takes in 'agg_result', the aggregator output for this BOQ, and
    selects which queries should be recorded in the results database:
    a query/result pair is kept when the result is valid and its match
    count lies within the category's result-set-size bounds.
    """
    sub_results = agg_result[qs.QRY_SUBRESULTS]
    assert len(self.queries) == len(sub_results)
    kept = []
    for query, result in zip(self.queries, sub_results):
        assert query
        assert result
        assert query[qs.QRY_QID] >= result[qs.QRY_QID]
        match_count = len(result[rdb.DBF_MATCHINGRECORDIDS])
        within_bounds = (
            qbs.get_rss_lower(query[qs.QRY_ENUM]) <= match_count <=
            qbs.get_rss_upper(query[qs.QRY_ENUM]))
        if within_bounds and result[qs.QRY_VALID]:
            kept.append((query, result))
    #capping at choose-num number of queries
    self.refined_queries_results = kept
def process_results(self, agg_results, db_object, query_file_handle,
                    refined_queries=None):
    """
    Takes in the aggregator results, with those results, determines
    which queries in the batch are 'interesting' it then instantiates
    query_results for those queries and uses it to write it to the
    results database.

    'refined_queries' is a list of already refined (query, result)
    pairs if the user does not wish to rely on the pre-defined refine
    logic below.  Each accepted M-of-N query is written to the results
    database twice: once as itself, and once re-cast as a 'P9'
    ORDER BY variant of the same where-clause.
    """
    #refine queries if not already refined.
    if refined_queries != None:
        # Pre-refined path: write each pair, then mutate it into its
        # P9 (ORDER BY) variant with a fresh qid and write it again.
        self.refined_queries_results = refined_queries
        for (q, r) in self.refined_queries_results:
            qr.QueryResultBase.write_to_full_to_atomic_table(
                q, r, db_object)
            qr.QueryResultBase.write_to_full_table(q, r, db_object)
            # Recurse into the first sub-bob with the paired
            # (sub_query, sub_result) list.
            q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
            self._print_query(q, query_file_handle)
            # 'IBM1' does not take part in the P9 variant; ignore if
            # it was never listed.
            try:
                q[qs.QRY_PERF].remove('IBM1')
            except ValueError:
                pass
            # Re-cast the query as a P9 ORDER BY variant of itself.
            q[qs.QRY_WHERECLAUSE] = q[qs.QRY_WHERECLAUSE] + " ORDER BY " +\
                q[qs.QRY_WHERECLAUSE] + " DESC"
            q[qs.QRY_ENUM] = qs.CAT.P9_EQ
            q[qs.QRY_CAT] = 'P9'
            # New qid for the new where-clause (deduplicated if seen).
            q[qs.QRY_QID] = qids.full_where_has_been_seen(
                qids.query_id(), q[qs.QRY_WHERECLAUSE])
            r[qs.QRY_QID] = q[qs.QRY_QID]
            qr.QueryResultBase.write_to_full_to_atomic_table(
                q, r, db_object)
            qr.QueryResultBase.write_to_full_table(q, r, db_object)
            self._print_query(q, query_file_handle)
            q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
    else:
        refined_total = 0
        refined_queries = []
        for x in xrange(len(self.queries)):
            comp_q = self.queries[x]
            sub_results = agg_results[qs.QRY_SUBRESULTS]
            # N = total number of clauses for this M-of-N query.
            num_clauses = comp_q[qs.QRY_N]
            sub_bobs = comp_q[qs.QRY_SUBBOBS]
            clause_q_b = []
            #create the list of possible queries that can make up the clauses
            #(they are also paired with the bobs that create them)
            for b in sub_bobs:
                clause_q = b.produce_queries()
                clause_q_b += [(q, b) for q in clause_q]
            clause_r = []
            #create list of results that go with those queries
            for (q, _) in clause_q_b:
                clause_r.append(sub_results[self.result_to_agg_map[q[
                    qs.QRY_WHERECLAUSE]]])
            comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
            #create a list of queries, their bobs, and their results,
            #sorted by ascending match count (Python 2 tuple-unpacking
            #lambda)
            clause_q_r = zip(clause_q_b, clause_r)
            clause_q_r = sorted(
                clause_q_r,
                key=lambda ((q, b), r): len(r[rdb.DBF_MATCHINGRECORDIDS]))
            #try all possible cominbations of the queries to test if any
            #have the correct combinations to match the required ftm and ress
            seen_where_group = []
            comp_q_refined = False
            for clause_set in itertools.combinations(
                    clause_q_r, num_clauses):
                # Only the first working combination is used; later
                # iterations are skipped once refined.
                if comp_q_refined == True:
                    continue
                clause_list = list(clause_set)
                values = [
                    q[qs.QRY_WHERECLAUSE] for ((q, _), _) in clause_list
                ]
                # Skip combinations with duplicate where-clauses or
                # where-clause groups already tried.
                if len(values) != len(
                        set(values)) or values in seen_where_group:
                    continue
                seen_where_group.append(values)
                #check to see if it is working
                #if stfm doesn't match, don't bother continuing
                # stfm = sum of match counts over the first N-M+1
                # (smallest) clauses.
                stfm = 0
                for offset in xrange(comp_q[qs.QRY_N] -
                                     comp_q[qs.QRY_M] + 1):
                    (_, r) = clause_list[offset]
                    stfm += len(r[rdb.DBF_MATCHINGRECORDIDS])
                if not all([
                        stfm >= qbs.get_tm_rss_lower(comp_q[qs.QRY_ENUM]),
                        stfm <= qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                ]):
                    continue
                #if stfm does match, calculate the set intersection:
                #a record matches the M-of-N query when it appears in
                #at least M clause result sets.
                matching_ids_set = set()
                for m_set in itertools.combinations(
                        clause_list, comp_q[qs.QRY_M]):
                    matching_ids_set.update(
                        reduce(set.intersection, [
                            set(r[rdb.DBF_MATCHINGRECORDIDS])
                            for (_, r) in m_set
                        ]))
                count = len(matching_ids_set)
                #check overall compliance
                if not all([
                        count >= qbs.get_rss_lower(comp_q[qs.QRY_ENUM]),
                        count <= qbs.get_rss_upper(comp_q[qs.QRY_ENUM])
                ]):
                    continue
                comp_q_refined = True
                refined_total += 1
                ##PROCESSING THE WORKING CLAUSE_LIST
                working_clauses = clause_list
                whereclauses = [
                    q[qs.QRY_WHERECLAUSE]
                    for ((q, _), _) in working_clauses
                ]
                where = ", ".join(whereclauses)
                where = 'M_OF_N(%d, %d, %s)' % (comp_q[qs.QRY_M],
                                                comp_q[qs.QRY_N], where)
                #update query with chosen clauses
                comp_q[qs.QRY_WHERECLAUSE] = where
                comp_q['sub_queries'] = [
                    q for ((q, _), _) in working_clauses
                ]
                comp_q[qs.QRY_SUBBOBS] = [
                    b for ((_, b), _) in working_clauses
                ]
                #have to create a list of counts of how many that match N terms, n-1 terms...
                #until m. Such of the form 34 | 384 | 1094
                records_matching_count = dict(
                    zip(range(comp_q[qs.QRY_M], comp_q[qs.QRY_N] + 1),
                        [0] * comp_q[qs.QRY_N]))
                # NOTE(review): 'id' shadows the builtin here.
                for id in matching_ids_set:
                    matching_terms = [
                        1 if id in clause[1][rdb.DBF_MATCHINGRECORDIDS]
                        else 0 for clause in working_clauses
                    ]
                    term_matches = sum(matching_terms)
                    records_matching_count[term_matches] += 1
                matching_records_counts = sorted(
                    records_matching_count.values(), reverse=True)
                #update the results dictionary with the new calculated values
                comp_q_results[qs.QRY_SUBRESULTS] = [
                    r for (_, r) in working_clauses
                ]
                comp_q_results[
                    rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                comp_q_results[
                    qs.QRY_MATCHINGRECORDCOUNTS] = matching_records_counts
                #make sure duplicate queries (and their atomic sub_components) have the same qids
                comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                for (sub_q, sub_r) in zip(comp_q['sub_queries'],
                                          comp_q_results[qs.QRY_SUBRESULTS]):
                    sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                        sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                    sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]
                #write queries to the results database
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(
                    comp_q, comp_q_results, db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                self._print_query(comp_q, query_file_handle)
                # As in the pre-refined branch: drop 'IBM1' (if listed)
                # and re-cast the query as its P9 ORDER BY variant,
                # then write and print again.
                try:
                    comp_q[qs.QRY_PERF].remove('IBM1')
                except ValueError:
                    pass
                comp_q[qs.QRY_WHERECLAUSE] = comp_q[qs.QRY_WHERECLAUSE] + " ORDER BY " +\
                    comp_q[qs.QRY_WHERECLAUSE] + " DESC"
                comp_q[qs.QRY_ENUM] = qs.CAT.P9_EQ
                comp_q[qs.QRY_CAT] = 'P9'
                comp_q[qs.QRY_QID] = \
                    qids.full_where_has_been_seen(
                        qids.query_id(), comp_q[qs.QRY_WHERECLAUSE])
                comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(
                    comp_q, comp_q_results, db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                self._print_query(comp_q, query_file_handle)
                refined_queries.append((comp_q, comp_q_results))
            logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                        (x, len(self.queries), refined_total))
            if comp_q_refined == True:
                logger.info(
                    "WORKING QUERY INFORMATION where_clause = %s, sftm = %d, rss = %d"
                    % (comp_q[qs.QRY_WHERECLAUSE], stfm, count))
        self.refined_queries_results = refined_queries
def process_results(self, agg_results, db_object, query_file_handle,
                    refined_queries=None):
    """
    Takes in the aggregator results, with those results, determines
    which queries in the batch are 'interesting' it then instantiates
    query_results for those queries and uses it to write it to the
    results database.

    'refined_queries' is a list of already refined (query, result)
    pairs if the user does not wish to rely on the pre-defined refine
    logic below.  The refine logic picks, for each composite query,
    the first combination of clauses whose OR'd result set and summed
    per-term match counts fall inside the configured bounds.
    """
    #refine queries if not already refined.
    if refined_queries:
        # Pre-refined path: just write each pair and print it.
        self.refined_queries_results = refined_queries
        for (comp_q, comp_q_results) in self.refined_queries_results:
            qr.QueryResultBase.write_to_full_to_atomic_table(
                comp_q, comp_q_results, db_object)
            qr.QueryResultBase.write_to_full_table(comp_q, comp_q_results,
                                                   db_object)
            comp_q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(comp_q['sub_queries'],
                    comp_q_results[qs.QRY_SUBRESULTS]))
            #print out the query
            self._print_query(comp_q, query_file_handle)
    else:
        refined_total = 0
        refined_queries = []
        for x in xrange(len(self.queries)):
            comp_q = self.queries[x]
            sub_results = agg_results[qs.QRY_SUBRESULTS]
            num_clauses = comp_q[qs.QRY_NUMCLAUSES]
            sub_bobs = comp_q[qs.QRY_SUBBOBS]
            clause_q_b = []
            working_clauses = None
            #create the list of possible queries that can make up the clauses
            #(they are also paired with the bobs that create them)
            for b in sub_bobs:
                clause_q = b.produce_queries()
                clause_q_b += [(q, b) for q in clause_q]
            clause_r = []
            #create list of results that go with those queries
            for (q, _) in clause_q_b:
                clause_r.append(sub_results[self.result_to_agg_map[q[
                    qs.QRY_WHERECLAUSE]]])
            comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
            #create a list of queries, their bobs, and their results,
            #dropping clauses whose match count already exceeds the
            #per-term upper bound.
            clause_q_r = zip(clause_q_b, clause_r)
            clause_q_r = [((q, b), r) for ((q, b), r) in clause_q_r
                          if len(r[rdb.DBF_MATCHINGRECORDIDS]) <=
                          qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])]
            # Not enough candidate clauses left to build the query.
            if len(clause_q_r) < num_clauses:
                continue
            #try all possible cominbations of the queries to test if any
            #have the correct combinations to match the required ftm and ress
            seen_where_group = []
            working_clauses = []
            q_refined = False
            for clause_set in itertools.combinations(
                    clause_q_r, num_clauses):
                #query has already been refined
                if q_refined:
                    continue
                clause_list = list(clause_set)
                values = [
                    q[qs.QRY_WHERECLAUSE] for ((q, _), _) in clause_list
                ]
                #there are duplicate values or this where has already been seen
                if len(values) != len(set(values)) or \
                   values in seen_where_group:
                    continue
                seen_where_group.append(values)
                #check conditions: OR semantics, so the record set is
                #the union, and 'all_match' is the sum of the per-term
                #match counts.
                matching_ids_set = reduce(set.union, [
                    set(r[rdb.DBF_MATCHINGRECORDIDS])
                    for (_, r) in clause_list
                ])
                count = len(matching_ids_set)
                all_match = sum(
                    map(len, [
                        r[rdb.DBF_MATCHINGRECORDIDS]
                        for (_, r) in clause_list
                    ]))
                if not all([
                        count >= qbs.get_rss_lower(comp_q[qs.QRY_ENUM]),
                        count <= qbs.get_rss_upper(comp_q[qs.QRY_ENUM]),
                        all_match >= qbs.get_tm_rss_lower(
                            comp_q[qs.QRY_ENUM]),
                        all_match <= qbs.get_tm_rss_upper(
                            comp_q[qs.QRY_ENUM])
                ]):
                    continue
                #this combination worked, so don't need to refine further for this
                #particular query
                q_refined = True
                refined_total += 1
                working_clauses = clause_list
                #update query with chosen clauses
                whereclauses = [
                    q[qs.QRY_WHERECLAUSE]
                    for ((q, _), _) in working_clauses
                ]
                comp_q[qs.QRY_WHERECLAUSE] = " OR ".join(whereclauses)
                comp_q['sub_queries'] = [
                    q for ((q, _), _) in working_clauses
                ]
                comp_q[qs.QRY_SUBBOBS] = [
                    b for ((_, b), _) in working_clauses
                ]
                ftm_match = len(
                    working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                comp_q_results[qs.QRY_SUBRESULTS] = [
                    r for (_, r) in working_clauses
                ]
                comp_q_results[
                    rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                comp_q_results[
                    qs.QRY_SUMRECORDSMATCHINGEACHTERM] = all_match
                comp_q_results[
                    qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = ftm_match
                #make sure duplicate queries (and their atomic sub_components) have the same qids
                comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                # BUGFIX: this previously read q[qs.QRY_QID], where 'q'
                # was the variable leaked from the comprehensions above
                # (i.e. the last sub-query), not the composite query.
                # Every sibling variant assigns the composite qid here.
                comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                for (sub_q, sub_r) in zip(
                        comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]):
                    sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                        sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                    sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]
                #create result objects and write to ground truth database
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(
                    comp_q, comp_q_results, db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                refined_queries.append((comp_q, comp_q_results))
                #print query
                self._print_query(comp_q, query_file_handle)
            #make where clause, update and with chosen queries and the aggregator results
            #with the chosen results
            logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                        (x, len(self.queries), refined_total))
            if q_refined:
                logger.info(
                    "WORKING QUERY INFORMATION where_clause = %s, sftm = %d, rss = %d"
                    % (comp_q[qs.QRY_WHERECLAUSE], all_match, count))
        self.refined_queries_results = refined_queries
def process_results(self, agg_results, db_object, query_file_handle,
                    refined_queries=None):
    """
    Takes in the aggregator results, with those results, determines
    which queries in the batch are 'interesting' it then instantiates
    query_results for those queries and uses it to write it to the
    results database.

    'refined_queries' is a list of already refined (query, result)
    pairs if the user does not wish to rely on the pre-defined refine
    logic below.  Unlike the sibling variants, the qid assignment and
    the database writes happen *after* refinement, in shared loops at
    the end, so they run for both branches.
    """
    #refine queries if not already refined.
    if refined_queries:
        self.refined_queries_results = refined_queries
    else:
        refined_queries = []
        refined_total = 0
        for x in xrange(len(self.queries)):
            comp_q = self.queries[x]
            sub_results = agg_results[qs.QRY_SUBRESULTS]
            # Older query dicts use QRY_N instead of QRY_NUMCLAUSES.
            try:
                num_clauses = comp_q[qs.QRY_NUMCLAUSES]
            except KeyError:
                num_clauses = comp_q[qs.QRY_N]
            sub_bobs = comp_q[qs.QRY_SUBBOBS]
            clause_q_b = []
            working_clauses = None
            #create the list of possible queries that can make up the clauses
            #(they are also paired with the bobs that create them)
            for b in sub_bobs:
                clause_q = b.produce_queries()
                clause_q_b += [(q, b) for q in clause_q]
            clause_r = []
            #create list of results that go with those queries
            for (q, _) in clause_q_b:
                clause_r.append(sub_results[self.result_to_agg_map[q[
                    qs.QRY_WHERECLAUSE]]])
            comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
            #create a list of queries, their bobs, and their results,
            #keeping only valid results
            clause_q_r = zip(clause_q_b, clause_r)
            clause_q_r = [((q, b), r) for ((q, b), r) in clause_q_r
                          if r[qs.QRY_VALID]]
            #try all possible cominbations of the queries to test if any
            #have the correct combinations to match the required ftm and ress
            seen_where_group = []
            working_clauses = []
            q_refined = False
            # Each candidate first clause is paired with every
            # (num_clauses - 1)-combination of the remaining clauses;
            # duplicates are filtered by the 'values' check below.
            for clause in clause_q_r:
                for clause_set in itertools.combinations(
                        clause_q_r, num_clauses - 1):
                    if q_refined == True:
                        continue
                    clause_list = [clause] + list(clause_set)
                    values = [
                        q[qs.QRY_WHERECLAUSE]
                        for ((q, _), _) in clause_list
                    ]
                    # Skip duplicate where-clauses and groups already
                    # tried.
                    if len(values) != len(
                            set(values)) or values in seen_where_group:
                        continue
                    seen_where_group.append(values)
                    # AND semantics: the record set is the
                    # intersection of all clause result sets.
                    matching_ids_set = reduce(set.intersection, [
                        set(r[rdb.DBF_MATCHINGRECORDIDS])
                        for (_, r) in clause_list
                    ])
                    count = len(matching_ids_set)
                    # At most one 'P2'-category clause is allowed per
                    # combination.
                    P2_cats = [
                        q for ((q, _), _) in clause_list
                        if q[qs.QRY_CAT] == 'P2'
                    ]
                    if not all([
                            count >= qbs.get_rss_lower(
                                comp_q[qs.QRY_ENUM]),
                            count <= qbs.get_rss_upper(
                                comp_q[qs.QRY_ENUM]),
                            len(P2_cats) <= 1
                    ]):
                        continue
                    #this combination worked, so don't need to refine further for this
                    #particular query
                    q_refined = True
                    refined_total += 1
                    working_clauses = clause_list
                    #reorder clauses: any P2 clause goes last.
                    re_ordered_clauses = []
                    last_clause = None
                    for ((q, b), r) in working_clauses:
                        if q[qs.QRY_CAT] == 'P2':
                            last_clause = ((q, b), r)
                        else:
                            re_ordered_clauses.append(((q, b), r))
                    if last_clause:
                        re_ordered_clauses.append(last_clause)
                    working_clauses = re_ordered_clauses
                    #update query with chosen clauses
                    whereclauses = [
                        q[qs.QRY_WHERECLAUSE]
                        for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_WHERECLAUSE] = " AND ".join(whereclauses)
                    comp_q['sub_queries'] = [
                        q for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_SUBBOBS] = [
                        b for ((_, b), _) in working_clauses
                    ]
                    # First-term match count, after reordering.
                    ftm_match = len(
                        working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                    matching_ids_set = reduce(set.intersection, [
                        set(r[rdb.DBF_MATCHINGRECORDIDS])
                        for (_, r) in working_clauses
                    ])
                    comp_q_results[qs.QRY_SUBRESULTS] = [
                        r for (_, r) in working_clauses
                    ]
                    comp_q_results[
                        rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                    comp_q_results[
                        qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = ftm_match
                    refined_queries.append((comp_q, comp_q_results))
            #make where clause, update and with chosen queries and the aggregator results
            #with the chosen results
            logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                        (x, len(self.queries), refined_total))
            if q_refined == True:
                logger.info(
                    "WORKING QUERY INFORMATION where_clause = %s, ftm = %d, rss = %d"
                    % (comp_q[qs.QRY_WHERECLAUSE], ftm_match, count))
        # Deduplicate qids: identical where-clauses (and their atomic
        # sub-components) must share the same qid.
        for (q, r) in refined_queries:
            q[qs.QRY_QID] = qids.full_where_has_been_seen(
                q[qs.QRY_QID], q[qs.QRY_WHERECLAUSE])
            r[qs.QRY_QID] = q[qs.QRY_QID]
            for (sub_q, sub_r) in zip(q['sub_queries'],
                                      r[qs.QRY_SUBRESULTS]):
                sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                    sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]
        #capping at choose-num number of queries
        self.refined_queries_results = refined_queries
    #create result objects and write to ground truth database
    # (runs for both the pre-refined and freshly refined branches)
    for (q, r) in self.refined_queries_results:
        qr.QueryResultBase.write_to_full_to_atomic_table(q, r, db_object)
        qr.QueryResultBase.write_to_full_table(q, r, db_object)
        q[qs.QRY_SUBBOBS][0].process_results(
            None, db_object, query_file_handle,
            zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
    #writing queries in sql format to file
    for (q, _) in self.refined_queries_results:
        if q != None:
            self._print_query(q, query_file_handle)
def process_results(self, agg_results, db_object, query_file_handle,
                    refined_queries=None):
    """
    Takes in the aggregator results, with those results, determines
    which queries in the batch are 'interesting' it then instantiates
    query_results for those queries and uses it to write it to the
    results database.

    'refined_queries' is a list of already refined (query, result)
    pairs if the user does not wish to rely on the pre-defined refine
    logic below.  The refine logic finds, per composite query, a
    combination of clauses whose first-term match count and overall
    intersection size fall inside the configured bounds, then greedily
    reorders the clauses so each successive intersection shrinks as
    fast as possible.
    """
    #refine queries if not already refined.
    if refined_queries:
        # Pre-refined path: just write each pair and print it.
        self.refined_queries_results = refined_queries
        for (comp_q, comp_q_results) in self.refined_queries_results:
            qr.QueryResultBase.write_to_full_to_atomic_table(
                comp_q, comp_q_results, db_object)
            qr.QueryResultBase.write_to_full_table(comp_q, comp_q_results,
                                                   db_object)
            comp_q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(comp_q['sub_queries'],
                    comp_q_results[qs.QRY_SUBRESULTS]))
            #print out the query
            self._print_query(comp_q, query_file_handle)
    else:
        refined_total = 0
        refined_queries = []
        for x in xrange(len(self.queries)):
            comp_q = self.queries[x]
            sub_results = agg_results[qs.QRY_SUBRESULTS]
            # Older query dicts use QRY_N instead of QRY_NUMCLAUSES.
            try:
                num_clauses = comp_q[qs.QRY_NUMCLAUSES]
            except KeyError:
                num_clauses = comp_q[qs.QRY_N]
            sub_bobs = comp_q[qs.QRY_SUBBOBS]
            clause_q_b = []
            working_clauses = None
            #create the list of possible queries that can make up the clauses
            #(they are also paired with the bobs that create them)
            for b in sub_bobs:
                clause_q = b.produce_queries()
                clause_q_b += [(q, b) for q in clause_q]
            clause_r = []
            #create list of results that go with those queries
            for (q, _) in clause_q_b:
                clause_r.append(sub_results[self.result_to_agg_map[q[
                    qs.QRY_WHERECLAUSE]]])
            comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
            #create a list of queries, their bobs, and their results,
            #sorted by ascending match count (Python 2 tuple-unpacking
            #lambda)
            clause_q_r = zip(clause_q_b, clause_r)
            clause_q_r = sorted(
                clause_q_r,
                key=lambda ((q, b), r): len(r[rdb.DBF_MATCHINGRECORDIDS]))
            #try all possible cominbations of the queries to test if any
            #have the correct combinations to match the required ftm and ress
            seen_where_group = []
            working_clauses = []
            q_refined = False
            for clause in clause_q_r:
                #don't need to check permuations if ftm doesn't match
                if q_refined == True:
                    continue
                # First-term match count for the candidate first clause.
                ftm_match = len(clause[1][rdb.DBF_MATCHINGRECORDIDS])
                if not all([
                        ftm_match >= qbs.get_tm_rss_lower(
                            comp_q[qs.QRY_ENUM]),
                        ftm_match <= qbs.get_tm_rss_upper(
                            comp_q[qs.QRY_ENUM])
                ]):
                    continue
                #alright ftm matches, let's check the rest of the clauses
                for clause_set in itertools.combinations(
                        clause_q_r, num_clauses - 1):
                    #query has already been refined
                    if q_refined == True:
                        continue
                    clause_list = [clause] + list(clause_set)
                    #check to see if any of the clauses or their fields are the same
                    #if so we know the intersection is one we are not interested in
                    values = [
                        q[qs.QRY_WHERECLAUSE]
                        for ((q, _), _) in clause_list
                    ]
                    fields = [
                        q[qs.QRY_FIELD] for ((q, _), _) in clause_list
                    ]
                    #there are duplicate values or this where has already been seen
                    if len(values) != len(set(values)) or \
                       len(fields) != len(set(fields)) or \
                       values in seen_where_group:
                        continue
                    seen_where_group.append(values)
                    #check conditions: AND semantics, so the record set
                    #is the intersection of all clause result sets.
                    matching_ids_set = reduce(set.intersection, [
                        set(r[rdb.DBF_MATCHINGRECORDIDS])
                        for (_, r) in clause_list
                    ])
                    count = len(matching_ids_set)
                    if not all([
                            count >= qbs.get_rss_lower(
                                comp_q[qs.QRY_ENUM]),
                            count <= qbs.get_rss_upper(
                                comp_q[qs.QRY_ENUM]),
                            ftm_match >= qbs.get_tm_rss_lower(
                                comp_q[qs.QRY_ENUM]),
                            ftm_match <= qbs.get_tm_rss_upper(
                                comp_q[qs.QRY_ENUM])
                    ]):
                        continue
                    #this combination worked, so don't need to refine further for this
                    #particular query
                    q_refined = True
                    refined_total += 1
                    #reorder clauses: keep the first clause, then
                    #greedily pick whichever remaining clause shrinks
                    #the cumulative intersection the most.
                    working_clauses = clause_list
                    reordered_clauses = working_clauses[:1]
                    working_clauses.remove(reordered_clauses[0])
                    cumulative_set = set(
                        reordered_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                    while len(working_clauses) > 0:
                        next_clause = working_clauses[0]
                        current_set = cumulative_set.intersection(
                            working_clauses[0][1][
                                rdb.DBF_MATCHINGRECORDIDS])
                        for clauses in working_clauses:
                            potential_set = cumulative_set.intersection(
                                clauses[1][rdb.DBF_MATCHINGRECORDIDS])
                            if len(potential_set) < len(current_set):
                                next_clause = clauses
                                current_set = potential_set
                        working_clauses.remove(next_clause)
                        reordered_clauses.append(next_clause)
                        cumulative_set = current_set
                    working_clauses = reordered_clauses
                    #update query with chosen clauses
                    whereclauses = [
                        q[qs.QRY_WHERECLAUSE]
                        for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_WHERECLAUSE] = " AND ".join(whereclauses)
                    comp_q['sub_queries'] = [
                        q for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_SUBBOBS] = [
                        b for ((_, b), _) in working_clauses
                    ]
                    # Recompute first-term count after reordering.
                    ftm_match = len(
                        working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                    matching_ids_set = reduce(set.intersection, [
                        set(r[rdb.DBF_MATCHINGRECORDIDS])
                        for (_, r) in working_clauses
                    ])
                    comp_q_results[qs.QRY_SUBRESULTS] = [
                        r for (_, r) in working_clauses
                    ]
                    comp_q_results[
                        rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                    comp_q_results[
                        qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = ftm_match
                    #get the id's lined up: duplicate where-clauses
                    #(and their atomic sub-components) share qids.
                    comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                        comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                    comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                    for (sub_q, sub_r) in zip(
                            comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]):
                        sub_q[
                            qs.QRY_QID] = qids.atomic_where_has_been_seen(
                                sub_q[qs.QRY_QID],
                                sub_q[qs.QRY_WHERECLAUSE])
                        sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]
                    #write the results to the results database
                    qr.QueryResultBase.write_to_full_to_atomic_table(
                        comp_q, comp_q_results, db_object)
                    qr.QueryResultBase.write_to_full_table(
                        comp_q, comp_q_results, db_object)
                    comp_q[qs.QRY_SUBBOBS][0].process_results(
                        None, db_object, query_file_handle,
                        zip(comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]))
                    #print out the query
                    self._print_query(comp_q, query_file_handle)
                    refined_queries.append((comp_q, comp_q_results))
            logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                        (x, len(self.queries), refined_total))
            if q_refined == True:
                logger.info(
                    "WORKING QUERY INFORMATION qid = %d, where_clause = %s, ftm = %d, rss = %d"
                    % (comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE],
                       ftm_match, count))
        #capping at choose-num number of queries
        self.refined_queries_results = refined_queries