import copy
import pickle
import random
import sys

import data_process  # project module; provides alphnum for the eval-based variants below


def __candidate_construction(query, row, query_pool, match_list, candidate_rate):
    # select the set of pool queries that match document X (built from `row`)
    match_query = {}
    document = ''
    for term in match_list:
        document += alphnum(getElement(term, row).lower()) + ' '
    for q, v in query_pool.items():
        match_query[q] = v
        for subq in q:
            if subq not in document:
                match_query.pop(q)
                break
    # the issuing query always matches its own result
    match_query[query] = query_pool[query]
    candidate_query = random.sample(list(match_query), int(len(match_query) * candidate_rate))
    return match_query, candidate_query

def __candidate_construction(query, row, query_pool, match_term, candidate_rate):
    # eval-based variant of the helper above: each entry of match_term is an
    # expression over `row` (e.g. "row['title']") instead of a dotted field path.
    # NOTE: it shares its name with the definition above, which it shadows.
    match_query = {}
    document = ''
    for term in match_term:
        try:
            document += data_process.alphnum(eval(term).lower()) + ' '
        except KeyError:
            continue
    for q, v in query_pool.items():
        match_query[q] = v
        for subq in q:
            if subq not in document:
                match_query.pop(q)
                break
    match_query[query] = query_pool[query]
    candidate_query = random.sample(list(match_query), int(len(match_query) * candidate_rate))
    return match_query, candidate_query

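
# alphnum and getElement are called above and in sota_sampler but are not
# defined in this file. The stand-ins below are a minimal sketch for local
# testing, inferred from the call sites: the real alphnum presumably comes
# from data_process, and getElement's behavior is an assumption.
import re


def alphnum(s):
    # assumed behavior: keep only alphanumerics and spaces
    return re.sub(r'[^0-9a-z ]', '', s)


def getElement(path, row):
    # assumed behavior: follow a pre-split dotted field path through nested
    # dicts, e.g. getElement(['user', 'name'], {'user': {'name': 'yong'}})
    for key in path:
        row = row[key]
    return row
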
def sota_sampler(query_pool, api, match_term, top_k, adjustment=1, samplenum=500):
    """
    A method to crawl documents from a search engine's corpus with equal probability
    ------**Random sampling from a search engine's index**

    :param query_pool: A dict mapping queries to their benefits, e.g. {frozenset(['yong', 'jun']): 5}.
    :param api: An implementation of simapi for a specific api.
    :param match_term: The fields used to match queries against returned documents.
    :param top_k: Only top_k documents are returned by the api.
    :param adjustment: A parameter used to improve the probability of accepting a document.
    :param samplenum: The size of the sample.
    :return: A list of sample documents returned by the api.
    """
    sample = []
    query_cost = 0
    params = api.getKwargs()
    query_pool_copy = copy.deepcopy(query_pool)
    matchlist = [m.split('.') for m in match_term]
    while len(sample) < samplenum:
        query_cost += 1
        # choose one valid query uniformly at random
        curQuery = random.choice(list(query_pool.items()))
        params[api.getSearchTerm()] = '+'.join(curQuery[0])
        result = api.callAPI(params=params)
        if 0 < len(result) < top_k:
            # accept the query with probability q/k (q = its result size),
            # else continue with probability 1 - q/k
            if random.uniform(0, 1) <= len(result) / (top_k * 1.0):
                # choose one returned document (edge) uniformly
                row = random.choice(result)
                document = ''
                for term in matchlist:
                    document += alphnum(getElement(term, row).lower()) + ' '
                # M(X): the number of pool queries matching this document;
                # the issuing query is counted even if the text match fails
                Mx = 0
                for q in query_pool_copy.keys():
                    Mx += 1
                    for subq in q:
                        if subq not in document:
                            Mx -= 1
                            break
                for subq in curQuery[0]:
                    if subq not in document:
                        Mx += 1
                        break
                # accept the document with probability adjustment / M(X), else continue
                if random.uniform(0, 1) < 1.0 * adjustment / Mx:
                    sample.append(document)
                    print('sample num:', len(sample), ' query cost:', query_cost)
        else:
            # overflowing (>= top_k) or empty queries cannot be sampled; drop them
            query_pool.pop(curQuery[0])
    print(query_cost, 'used for sampling.', file=sys.stderr)
    with open('sample_' + str(query_cost), 'wb') as f:
        pickle.dump(sample, f)
    return sample

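
# A minimal in-memory stand-in for the simapi interface, written from how
# `api` is used above (getKwargs, getSearchTerm, callAPI); the corpus layout
# and the 'title'/'q' names are illustrative assumptions, not the real API.
class InMemoryApi(object):
    def __init__(self, corpus, top_k):
        self.corpus = corpus  # e.g. [{'id': 0, 'title': 'yong jun'}, ...]
        self.top_k = top_k

    def getKwargs(self):
        return {}  # any fixed request parameters

    def getSearchTerm(self):
        return 'q'  # name of the query parameter

    def callAPI(self, params):
        terms = params['q'].split('+')
        hits = [row for row in self.corpus
                if all(t in row['title'] for t in terms)]
        return hits[:self.top_k]  # a real engine truncates to top_k


# Hedged usage sketch: with the stand-ins above in place, sampling a tiny
# corpus looks like this (frozenset keys keep the pool's queries hashable).
# corpus = [{'id': i, 'title': t} for i, t in
#           enumerate(['yong jun', 'jun wu', 'yong wu jun'])]
# pool = {frozenset(['yong']): 2, frozenset(['jun']): 2, frozenset(['wu']): 2}
# sample = sota_sampler(pool, InMemoryApi(corpus, top_k=10), ['title'],
#                       top_k=10, samplenum=2)
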
def sota_estimator(query_pool, api, match_term, uniqueid, query_num):
    """
    A method to efficiently estimate aggregates over a search engine's corpus
    ------**Efficient search engine measurements**

    :param query_pool: A dict mapping queries to their benefits, e.g. {frozenset(['yong', 'jun']): 5}.
    :param api: An implementation of simapi for a specific api.
    :param match_term: Eval expressions over `row` used to match queries against returned documents.
    :param uniqueid: An eval expression over `row` giving the unique id of a returned message.
    :param query_num: The number of sampled queries used for the estimate.
    :return: An estimate of count(*) of the search engine.
    """
    count = 0
    query_cost = 0
    params = api.getKwargs()
    for _ in range(query_num):
        # choose one query uniformly at random
        curQuery = random.choice(list(query_pool.items()))
        params[api.getSearchTerm()] = '+'.join(curQuery[0])
        result = api.callAPI(params=params)
        query_cost += 1
        if len(result) == 0:
            continue
        # estimate a weight for each returned document
        for row in result:
            try:
                r_id = eval(uniqueid)
            except KeyError:
                continue
            document = ''
            for term in match_term:
                try:
                    document += data_process.alphnum(eval(term).lower()) + ' '
                except KeyError:
                    continue
            # collect the set of pool queries matching this document
            match_query = []
            for q in query_pool:
                match_query.append(q)
                for subq in q:
                    if subq not in document:
                        match_query.pop()
                        break
            if curQuery[0] not in match_query:
                match_query.append(curQuery[0])
            # estimate a weight for this edge: redraw matching queries until
            # one is confirmed to return the document (or is the current query)
            t = 0
            while True:
                t += 1
                query = random.choice(match_query)
                if query == curQuery[0]:
                    count += 1.0 * t / len(match_query)
                    print('count: ', count, ' query cost: ', query_cost)
                    break
                params[api.getSearchTerm()] = '+'.join(query)
                mresult = api.callAPI(params=params)
                query_cost += 1
                if len(mresult) == 0:
                    continue
                for mrow in mresult:
                    try:
                        # prefixing 'm' rewrites e.g. "row['id']" to "mrow['id']"
                        if r_id == eval('m' + uniqueid):
                            count += 1.0 * t / len(match_query)
                            print('count: ', count, ' query cost: ', query_cost)
                            break
                    except KeyError:
                        continue
                else:
                    continue  # document not returned for this query; redraw
                break
    count = 1.0 * count * len(query_pool) / query_num
    print('query cost: ', query_cost, ' count: ', count)
    return count
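
# Hedged usage sketch for sota_estimator, reusing the InMemoryApi stub above.
# Here match_term and uniqueid are eval strings over the loop variable `row`,
# and this call assumes the real data_process.alphnum is importable.
# corpus = [{'id': i, 'title': t} for i, t in
#           enumerate(['yong jun', 'jun wu', 'yong wu', 'yong jun wu'])]
# pool = {frozenset(['yong']): 3, frozenset(['jun']): 3, frozenset(['wu']): 3}
# est = sota_estimator(pool, InMemoryApi(corpus, top_k=10), ["row['title']"],
#                      "row['id']", query_num=20)
# est should roughly approximate count(*) of the stub corpus, with
# noticeable variance at small query_num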