def show_layers_effect(query_set, searcher, layers_combs): ''' Shows performance for different layers configuration. ''' queries = load_query_set(query_set, 40) legend = {'P': 'papers_relev', 'A': 'authors_relev', 'K': 'words_relev', 'V': 'venues_relev'} params = {'age_relev': 0.0, 'query_relev': 0.0, 'ctx_relev': 0.0} for layers in layers_combs : print "%s\t" % layers, for layer in legend: params[legend[layer]] = 1.0 if (layer in layers) else 0.0 searcher.set_params(**params) get_search_metrics(queries, searcher, show=True, force=True, results_file=("%s/results/layers_effect/%s.p" % (config.DATA, layers))) print
def save_scholar_results(query_set, n):
    ''' For each given query, request on google scholar, search for textually
    similar entries on the index and show them to the user for confirmation.

    Fetches at least `n` result titles per query (paginated 20 at a time) and
    writes them, one per line after the query text, to
    "<config.DATA>/scholar/<config.DATASET>/<query_id>.txt".
    Queries whose output file already exists are skipped, so the function is
    resumable across runs.

    Raises Exception when the very first page of a query returns zero
    articles (interpreted as the request having been blocked).
    '''
    from scholar_api import ScholarQuerier, SearchScholarQuery
    queries = load_query_set(query_set)
    querier = ScholarQuerier()
    scholar_query = SearchScholarQuery()
    # Folder to store saved results
    # from_folder = config.QUERY_SETS_PATH + "manual"
    save_folder = "%s/scholar/%s" % (config.DATA, config.DATASET)
    for query, query_id, _, _, _, _ in queries:
        # Only searches if file doesn't already exists
        file_path = "%s/%s.txt" % (save_folder, query_id)
        if os.path.exists(file_path):
            continue
        # Pause between queries — presumably to avoid Scholar rate limiting;
        # TODO confirm 1s is enough in practice.
        time.sleep(1)
        print "\nProcessing '%s'" % query,
        scholar_query.set_words(query)
        # We stop requesting new pages once we found at least n
        start = 0
        titles = []
        while (len(titles) < n):
            scholar_query.set_start(start)
            querier.send_query(scholar_query)
            # Check if we got a captcha as response
            if (len(querier.articles) == 0):
                if (start == 0):
                    # Zero results on the first page is treated as a block,
                    # not an empty result set.
                    raise Exception(
                        "Request probably got blocked due to overload.")
                else:
                    # If we got 0 article after some requests it may be that
                    # all articles were fetched. Skip to next query.
                    break
            # Get only titles and try to find entries in our dataset for them
            for article in querier.articles:
                titles.append(article['title'].strip('. '))
            # Set correct pagination
            # NOTE(review): assumes Scholar pages hold 20 results — confirm
            # against scholar_api before changing.
            start += 20
        # Write to file
        with open(file_path, 'a') as file:
            print >> file, query
            for title in titles:
                print >> file, "%s" % (title.encode("UTF-8"))
def save_layers_results_query_set(query_set):
    ''' Stores per-layer result files for the first 10 queries of a set.

    One results folder is produced per layer under
    "<config.DATA>/results/layers/<config.DATASET>/<layer>/".
    '''
    loaded = load_query_set(query_set, 10)
    # Keep only the query text from each 6-field entry.
    query_texts = [q for q, _, _, _, _, _ in loaded]
    searcher = Searcher(**PARAMS)
    for layer in ['paper', 'author', 'venue', 'ngram']:
        folder = "%s/results/layers/%s/%s/" % (config.DATA,
                                               config.DATASET,
                                               layer)
        get_layer_results(query_texts, searcher, folder, layer)
def save_aminer_results(query_set): queries = load_query_set(query_set, limit=100) for query, pub_id, _, _, _, _ in queries: # folder = config.DATA + "aminer/" + query_set folder = "%s/aminer/%s" % (config.DATA, config.DATASET) if (not os.path.exists(folder)): os.makedirs(folder) # Only searches if file doesn't already exists file_path = "%s/%s.txt" % (folder, pub_id) if os.path.exists(file_path): continue titles = search_aminer(query, 20) print "%d\t%s" % (len(titles), query) with open(file_path, 'w') as file: print >> file, query for title in titles: print >> file, "%s" % (title.encode("UTF-8"))
def main():
    ''' Entry point: runs the active searchers over the active query sets
    and reports search metrics per (query set, searcher) pair.

    The commented blocks below are the experiment history of this script
    (parameter sweeps, layer ablations, alternative baselines); enable the
    relevant lines to reproduce a given experiment.
    '''
    # for qs in ["manual", "surveys", "testing"] :
    #     print "\n%s" % qs
    #     time_diversity(["MultiLayered", "TopCited(G)"], qs)
    # sys.exit()
    # config.IN_MODELS_FOLDER = config.DATA + "topic_models/%s_%d_%d"
    ## check_topics_effect(Searcher(**PARAMS), "manual")
    # show_layers_effect("manual", Searcher(**PARAMS), ["PAK", "PAT", "PAKT"][:1])
    # save_layers_results_queries(["citation recommendation",
    #                              "author recommendation",
    #                              "link prediction"],
    #                             folder="/var/tmp/results")
    # save_layer_results("manual")
    # query_set = 'manual'
    # query_set = 'surveys'
    # query_set = 'tuning'
    # query_set = 'testing'
    # queries = load_query_set(query_set, 200)
    # vary_parameters(Searcher(**PARAMS), queries, 'age_relev', [0.0, 0.001, 0.01, 0.1, 0.25, 0.5, 1.0])
    # vary_parameters(Searcher(**PARAMS), queries, 'ctx_relev', [0.0, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0])
    # vary_parameters(Searcher(**PARAMS), queries, 'query_relev', [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.8])
    # layers = ["P","PA","PV","PK","PAV","PAK","PKV","PAKV"]
    # show_layers_effect(queries, Searcher(**BEST_PARAMS), layers)
    # show_attenuators_effect(queries, Searcher(**PARAMS))
    # vary_rho_values(Searcher(**PARAMS), queries, 'papers_relev', [0.05, 0.1, 0.25, 0.5, 0.75, 0.85, 0.9, 1.0])
    # vary_rho_values(Searcher(**PARAMS), queries, 'authors_relev', [0.0, 0.1, 0.25, 0.5, 0.75, 0.9])
    # vary_rho_values(Searcher(**PARAMS), queries, 'topics_relev', [0.0, 0.1, 0.25, 0.5, 0.75, 0.9])
    # vary_rho_values(Searcher(**PARAMS), queries, 'words_relev', [0.0, 0.1, 0.25, 0.5, 0.75, 0.9])
    # vary_rho_values(Searcher(**BEST_PARAMS), queries, 'venues_relev', [0.0, 0.05, 0.1, 0.25, 0.5])
    # get_other_layers(load_query_set('manual', 10), Searcher(**BEST_PARAMS), "ngram")

    # Query sets currently active; uncomment to add more.
    query_sets = [
        # 'manual',
        # 'surveys',
        # 'tuning',
        'testing'
    ]
    # Searchers currently active; the commented entries are baselines.
    searchers = [
        Searcher(**PARAMS),
        # Searcher(**config.PARAMS),
        # PageRankSubgraphSearcher(**PARAMS),
        # TopCitedSubgraphSearcher(**PARAMS),
        # TopCitedGlobalSearcher(),
        # TFIDFSearcher(),
        # BM25Searcher(),
        # CiteRankSearcher(tau=2.6),
        # PageRankFilterBeforeSearcher(),
        # PageRankFilterAfterSearcher(),
        # GoogleScholarSearcher(),
        # ArnetMinerSearcher(),
        # MengSearcher(),
        # CiteseerSearcher("eval/citeseer"),
        # WeightedTopCitedSubgraphSearcher(**PARAMS)
    ]
    for query_set in query_sets:
        log.info("Running '%s' query set.\n" % query_set)
        queries = load_query_set(query_set, 200)
        for s in searchers:
            print "%s\t" % s.name(),
            # print "\nRunning %s with %d queries from %s set..." % \
            #         (s.name(), len(queries), query_set)
            # Hand-tuned parameters per searcher; dispatched by name.
            if s.name() == "MultiLayered":
                s.set_params(**{
                    'K': 20,
                    'H': 1,
                    'papers_relev': 0.25,
                    'authors_relev': 0.25,
                    'words_relev': 0.25,
                    'topics_relev' : 0.0,
                    'venues_relev': 0.25,
                    'alpha': 0.3,
                    'query_relev': 0.3,
                    'age_relev': 0.01,
                    'ctx_relev': 0.5})
            if s.name() == "TopCited(G)":
                # TopCitedSubgraphSearcher
                s.set_params(**{
                    'K': 20,
                    'H': 1,
                    'papers_relev': 0.25,
                    'authors_relev': 0.25,
                    'words_relev': 0.25,
                    'topics_relev' : 0.0,
                    'venues_relev': 0.25,
                    'alpha': 0.3,
                    'query_relev': 0.3,
                    'age_relev': 0.01,
                    'ctx_relev': 0.5})
            if s.name() == "WeightedTopCited(G)":
                s.set_params(**{
                    'K': 20,
                    'H': 1,
                    'query_relev': 0.15,  # 0.15
                    'age_relev': 0.01,    # 0.01
                    'ctx_relev': 0.8,     # 0.6 (manual), 0.8
                    'beta': 0.1})         # 0.1
            rfile = get_results_file(query_set, s.name())
            get_search_metrics(queries, s, force=True, results_file=rfile)
            # NOTE(review): del only unbinds the loop name; the searcher
            # object is still referenced by the `searchers` list, so no
            # memory is actually released here.
            del s
def check_topics_effect(searcher, query_set):
    ''' Evaluates the given searcher on the first 10 queries of a set,
    forcing metrics to be recomputed. '''
    sample = load_query_set(query_set, 10)
    get_search_metrics(sample, searcher, force=True)
maps = [] for query, pub_id, _year, actual_docs, _rels, _titles in queries: top = self.searcher.search(query, limit=20, exclude=set([pub_id]), force=False) maps.append(apk(actual_docs, top, k=20)) return np.mean(maps) if __name__ == '__main__': query_set = 'manual' queries = load_query_set(query_set, 30) se = SearchEvaluator(Searcher(**config.PARAMS)) bocv = BayesianOptCV( estimator=se, param_bounds={ 'K': (5, 50), 'papers_relev': (0.001, 1.0), 'authors_relev': (0.001, 1.0), 'venues_relev': (0.001, 1.0), 'words_relev': (0.001, 1.0), 'alpha': (0.1, 0.9), 'age_relev': (0.001, 1.0), 'query_relev': (0.001, 1.0), 'ctx_relev': (0.01, 10.0) },