def test_sota_sampler(self):
    local_file = os.path.abspath(os.path.dirname(__file__) + os.path.sep + "../../") + '/pkl_example/yelp_3000.pkl'
    localdata = LocalData(local_file, 'pkl', "business_id", ["name"], ["name", "full_address"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
    initQueries = utils.queryGene(localdata_query, 2)
    sampler.sota_sampler(query_pool=initQueries, api=self.yelp, match_term=localdata.getQueryList(),
                         top_k=300, adjustment=1, samplenum=1)
    self.yelp.getSession().close()
    assert True
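# The test above assumes a companion setUp that builds self.yelp. A minimal sketch, modeled on the
# Sota-Sampler Yelp example further below; the client_id/client_secret values are placeholders
# (assumptions), not credentials from this repo.
def setUp(self):
    search_term = 'term'
    parameters = {'limit': 50, 'location': 'AZ'}
    self.yelp = SearchApi(client_id='YOUR_CLIENT_ID', client_secret='YOUR_CLIENT_SECRET',
                          top_k=300, delay=5, search_term=search_term, **parameters)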
def setUp(self):
    search_term = 'q'
    parameters = {'h': 1000}
    self.dblp = PublApi(delay=5, search_term=search_term, **parameters)
    localdata_file = os.path.abspath(os.path.dirname(__file__) + os.path.sep + "../../") + '/example/dblp_10000.pkl'
    localdata = LocalData(localdata_file, 'pkl', "row['key']", ["row['title']"], ["row['title']"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
    initQueries = utils.queryGene(localdata_query, 2)
    self.initQueries = initQueries
def setUp(self):
    search_term = 'q'
    parameters = {'h': 1000}
    self.dblp = PublApi(top_k=1000, delay=5, search_term=search_term, **parameters)
    localdata_file = os.path.abspath(os.path.dirname(__file__) + os.path.sep + "../../") + '/csv_example/dblp_sample.csv'
    localdata = LocalData(localdata_file, 'csv', "key", ["title"], ["title"])
    localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
    initQueries = utils.queryGene(localdata_query, 2)
    self.initQueries = initQueries
from deeperlib.api.yelp.searchapi import SearchApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import sampler

# ==== Sota-Sampler Yelp ====
client_id = "kCe2YbZePXsPnC204ZrXoQ"
client_secret = "s9KnvEEQW7jaA2wlrBi4X2fnDQ0F7asdklXVvJUidWp8i50ov24E8EjkHX2AUhoL"
search_term = 'term'
parameters = {'limit': 50, 'location': 'AZ'}
yelp = SearchApi(client_id=client_id, client_secret=client_secret, top_k=300, delay=5,
                 search_term=search_term, **parameters)

local_file = 'yelp_3000.pkl'
localdata = LocalData(local_file, 'pkl', "row['business_id']", ["row['name']"],
                      ["row['name']", "row['full_address']"])
localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
initQueries = utils.queryGene(localdata_query, 2)

sampler.sota_sampler(query_pool=initQueries, api=yelp, match_term=localdata.getQueryList(),
                     top_k=300, adjustment=1)
yelp.getSession().close()
import copy
import sys
import timeit

from deeperlib.core import utils

perr = sys.stderr  # assumed: progress messages go to stderr


def SmartCrawl(budget, api, sampledata, localdata, hiddendata, pool_thre=2, jaccard_thre=0.75, threads=4):
    # ---- load the local database, the sample of the hidden database, and the API limits ----
    time_s = timeit.default_timer()
    sample = sampledata.getSample()
    D1_ids, D1_query, D1_er = localdata.getlocalData()
    top_k = api.getTopk()
    sample_rate = sampledata.getRatio() / 100.0
    Dratio = 1.0 * len(D1_ids) * sample_rate / len(sample)
    time_e = timeit.default_timer()
    print(time_e - time_s, 'data loaded.', file=perr)

    time_s = timeit.default_timer()
    initQueries = utils.queryGene(D1_query, pool_thre)
    time_e = timeit.default_timer()
    print(time_e - time_s, 'query pool finished.', file=perr)

    ##### inverted index #####
    time_s = timeit.default_timer()
    D1index = utils.invertedIndex(initQueries, D1_query)
    initQueries, D1index = utils.add_naiveIndex(initQueries, D1_query, D1index)
    sampleindex = utils.invertedIndex(initQueries, sample)
    time_e = timeit.default_timer()
    print(time_e - time_s, 'index building finished.', file=perr)

    ##### forward index #####
    time_s = timeit.default_timer()
    findex = utils.forwardIndex(D1index)
    time_e = timeit.default_timer()
    print(time_e - time_s, 'forward index', file=perr)

    ##### biased #####
    D1_ids_deeper = copy.deepcopy(D1_ids)
    query_pool = utils.initScore_biased(sampleindex, top_k, sample_rate, Dratio, initQueries)
    flagNum = len(query_pool) - budget
    curcov = set()
    curmat = []
    updateList = utils.updateList(D1index)
    queryList = []

    while len(query_pool) > flagNum and len(query_pool) != 0 and len(curcov) < len(D1_ids):
        # pick the next batch of queries, re-scoring any query whose covered records changed
        queries = []
        while len(queries) < threads:
            if len(query_pool) > flagNum and len(query_pool) > 0:
                top = query_pool.popitem()
                if updateList[top[0]] != 0:
                    if len(sampleindex[top[0]]) <= top_k * sample_rate:
                        if len(sampleindex[top[0]]) == 0 and len(D1index[top[0]]) > (top_k * Dratio):
                            new_priority = top[1] - updateList[top[0]] * top_k * Dratio / len(D1index[top[0]])
                        else:
                            new_priority = top[1] - updateList[top[0]]
                    else:
                        new_priority = top[1] - updateList[top[0]] * top_k * sample_rate / len(sampleindex[top[0]])
                    query_pool.additem(top[0], new_priority)
                    updateList[top[0]] = 0
                    continue
                else:
                    queries.append(list(top[0]))
            else:
                break
        queryList.extend(queries)

        # issue the batch, process the returned records, and match them against the local database
        cur_raw_result = api.callMulAPI(queries)
        cur_er_result = hiddendata.proResult(cur_raw_result)
        matched_ids, matched_pair = utils.results_simjoin(cur_er_result, D1_er, jaccard_thre)

        # mark newly covered local records so the affected queries get re-scored next round
        removed_ids = D1_ids_deeper.intersection(matched_ids)
        for d in removed_ids:
            for q in findex[d]:
                updateList[q] += 1
        D1_ids_deeper.difference_update(matched_ids)

        curcov = curcov.union(matched_ids)
        curmat.extend(matched_pair)
        print('smartcrawl, coverage ratio:', 100.0 * len(curcov) / len(D1_ids), '%, ',
              len(cur_raw_result), 'results returned, ',
              len(matched_ids), 'local records covered at this iteration. ',
              len(hiddendata.getMergeResult()), 'different results returned, ',
              len(curcov), 'local records covered totally.', file=perr)

    api.getSession().close()
    hiddendata.setQueryList(queryList)
    hiddendata.setMatchPair(curmat)
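# ==== SmartCrawl usage (sketch) ====
# A minimal sketch of how SmartCrawl might be wired together, reusing the SearchApi/LocalData calls
# from the Yelp example above. MySampleData and MyHiddenData are hypothetical stand-ins written for
# illustration only: they expose just the interface SmartCrawl calls (getSample/getRatio on the
# sample side; proResult/getMergeResult/setQueryList/setMatchPair on the hidden-database side).
# In the library itself these roles are played by its own sample-data and hidden-data classes.
from deeperlib.api.yelp.searchapi import SearchApi
from deeperlib.data_processing.local_data import LocalData


class MySampleData:
    """Hypothetical stand-in holding a pre-drawn sample of the hidden database."""

    def __init__(self, sample, ratio):
        self._sample = sample   # list of sampled hidden-database records
        self._ratio = ratio     # sampling ratio, in percent

    def getSample(self):
        return self._sample

    def getRatio(self):
        return self._ratio


class MyHiddenData:
    """Hypothetical stand-in accumulating the results returned by the hidden database."""

    def __init__(self):
        self._merged = {}
        self._queries = []
        self._pairs = []

    def proResult(self, raw_results):
        # naive processing: merge results by id and hand them back for similarity matching
        for r in raw_results:
            self._merged[r.get('id')] = r
        return raw_results

    def getMergeResult(self):
        return self._merged

    def setQueryList(self, queries):
        self._queries = queries

    def setMatchPair(self, pairs):
        self._pairs = pairs


search_term = 'term'
parameters = {'limit': 50, 'location': 'AZ'}
yelp = SearchApi(client_id='YOUR_CLIENT_ID', client_secret='YOUR_CLIENT_SECRET', top_k=300,
                 delay=5, search_term=search_term, **parameters)
localdata = LocalData('yelp_3000.pkl', 'pkl', "row['business_id']", ["row['name']"],
                      ["row['name']", "row['full_address']"])
localdata_ids, localdata_query, localdata_er = localdata.getlocalData()

# illustration only: a real run would pass an actual sample drawn from the hidden database
sampledata = MySampleData(sample=localdata_query, ratio=10)
hiddendata = MyHiddenData()

SmartCrawl(budget=500, api=yelp, sampledata=sampledata, localdata=localdata, hiddendata=hiddendata)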