def set_engine(self):
    #if self.cache:
    #todo currently defaults to using the cache; may want to change this
    self.cache = 'engine'
    if self.key:
        self.engine = EngineFactory(engine=self.engine_name, api_key=self.key,
                                    throttle=0.25, cache=self.cache)
    else:
        self.engine = EngineFactory(engine=self.engine_name, cache=self.cache,
                                    throttle=0.25)
    if self.domain:
        self.engine.site = self.domain
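# set_engine assumes the enclosing object carries engine_name, key, cache and
# domain attributes. A hypothetical caller sketch (the PageCalculator class
# name and attribute values are illustrative, not from the original):
calc = PageCalculator()
calc.engine_name = 'sitebing'
calc.key = None
calc.domain = 'gla.ac.uk'
calc.set_engine()  # builds calc.engine via EngineFactory with caching on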
def main():
    engine = EngineFactory(engine='Sitebing',
                           api_key="msRh5UoZzyV3qvroEpzXMzbZEVjW3ENfTGMAQO1yuRc",
                           throttle=0.1, cache='engine')
    query_generator = TriTermQueryGeneration(minlen=TERM_LEN,
                                             stopwordfile=STOPWORD_FILE,
                                             maxsize=100)
    tuple_list = get_trending_queries(URL_FILE)
    page_calculator = PageRetrievabilityCalculator(engine=engine, cutoff=CUTOFF,
                                                   generator=query_generator)
    with open(RESULT_FILE, 'a') as f:
        # renamed the loop variable from 'tuple' to avoid shadowing the builtin;
        # each entry is (findability, url, category_name)
        for entry in tuple_list:
            findability = entry[0]
            url = entry[1]
            category_name = entry[2]
            retrievability = page_calculator.score_page(url)
            s = page_calculator.stats()
            f.write('{0},{1},{2},{3},{4},{5}\n'.format(
                category_name, url, findability, retrievability,
                s['retrieved'], s['query_count']))
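# get_trending_queries is not shown in this extract; the indexing above implies
# it yields (findability, url, category_name) tuples. A hypothetical sketch,
# assuming URL_FILE is a comma-separated file with fields in that order:
def get_trending_queries(url_file):
    tuples = []
    with open(url_file) as f:
        for line in f:
            parts = line.strip().split(',')
            if len(parts) >= 3:
                tuples.append((parts[0], parts[1], parts[2]))
    return tuples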
def setUp(self):
    self.logger = logging.getLogger("TestPageRetCalc")
    # currently the engine is set to govuk; may need to change this
    self.engine = EngineFactory(engine="govuk")
    # url may need to be changed
    self.url = "https://www.gov.uk/renew-adult-passport"
    self.pg_calc = PageRetrievabilityCalculator(engine=self.engine)
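# A minimal companion test sketch (not from the original suite), assuming
# score_page accepts a URL plus a list of query strings and stats() returns
# a dict with a 'query_count' key, as the scripts elsewhere in this repo do:
def test_score_page(self):
    queries = ['renew passport', 'adult passport']
    self.pg_calc.score_page(self.url, queries)
    self.assertTrue('query_count' in self.pg_calc.stats())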
def fetch_results(queries_list):
    """Builds a list of tuples (category, url, rank) and returns it."""
    myengine = EngineFactory('bing', api_key=API_KEY)
    result_list = []
    for term in queries_list:
        query = Query(term[1], top=30)
        response = myengine.search(query)
        #TODO implement select_ranks properly, maybe (num_to_select, step)
        rank_list = select_ranks(6, 10)  #TODO make these arguments
        for rank in rank_list:
            # term[0] is the trend category, term[1] is the search term
            try:
                result_list.append((term[0], response.results[rank].url, rank))
                #print "appended " + term[0] + response.results[rank].url
            except IndexError:
                print "index error.."
    print result_list[:]
    return result_list
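# select_ranks is referenced above but left as a TODO. A minimal sketch,
# assuming it should yield num_to_select rank indices spaced step apart;
# indices past the end of the result list are caught by the IndexError
# handler above:
def select_ranks(num_to_select, step):
    """Return evenly spaced ranks, e.g. select_ranks(6, 10) -> [0, 10, 20, 30, 40, 50]."""
    return [i * step for i in range(num_to_select)]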
def test_game_scoring(self):
    self.logger.info("Testing Game Scoring with a Dummy SearchEngine")
    se = EngineFactory("Dummy")
    u = User.objects.get(username='******')
    c = Category.objects.get(name='Numbers')
    gm = GameMechanic(se)
    gm.create_game(u, c)
    self.logger.info("Checking if the category Numbers has four pages.")
    self.assertEquals(len(gm.pages), 4)
    gm.handle_query('one')
    gm.take_points()
    gm.set_next_page()
    self.logger.info("Checking whether the query, one, scores 1000 points - "
                     "which it should given the data and dummy search engine")
    self.assertEquals(gm.get_current_score(), 1000)
def main(): """ :return: """ parser = argparse.ArgumentParser( description="Page Calculator for pages") parser.add_argument("-u", "--url", type=str, help="url address") parser.add_argument("-e","--engine",type=str, help="Name of search engine: " + ENGINE_LIST.__str__()) parser.add_argument("-k","--key",type=str, help="API Key for search engine (if applicable)") parser.add_argument("-c","--cutoff", type=int, help ="The cutoff value for queries") parser.add_argument("-m","--maxqueries", type=int, help ="The maximum number of queries per page") parser.add_argument("-s","--stopwordfile", type=str, help ="The filename name containing stopwords") parser.add_argument("-b","--backgroundfile", type=str, help ="The filename name containing background term counts") parser.add_argument("-ca", "--cache", action="store_true", default=False, help="use cache") args = parser.parse_args() if not args.url: print "Check your URL argument" parser.print_help() return 2 cache = None if args.cache: cache = 'engine' if args.key: engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache) else: print "cache is ", cache engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1) stopwordfile = None if args.stopwordfile: stopwordfile = args.stopwordfile mq = 50 if args.maxqueries: mq = args.maxqueries backgroundfile = 'background.txt' if args.backgroundfile: backgroundfile = args.backgroundfile doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=stopwordfile) query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile) print "Loading background distribution" colLM = LanguageModel(file=backgroundfile) print "Background loaded, number of terms: ", colLM.get_num_terms() print "Fetching page: %s" % (args.url) pc = PageCapture(args.url) page_html = pc.get_page_sourcecode() print "Page loaded" doc_extractor.extract_queries_from_html(page_html) doc_term_counts = doc_extractor.query_count print "Number of terms in document: %d" % (len(doc_term_counts)) docLM = LanguageModel(term_dict=doc_term_counts) slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500) query_list = query_generator.extract_queries_from_html(page_html) print "Queries generated: ", len(query_list) qr = OddsRatioQueryRanker(smoothed_language_model=slm) scored_queries = qr.calculate_query_list_probabilities(query_list) queries = qr.get_top_queries(mq) query_list = [] for query in queries: query_list.append(query[0]) prc = PageRetrievabilityCalculator(engine=engine) prc.score_page(args.url, query_list) print "\nRetrievability Scores for cumulative c=20" prc.calculate_page_retrievability(c=20) prc.report() print "\nRetrievability Scores for gravity beta=1.0" prc.calculate_page_retrievability(c=20, beta=1.0) prc.report() print "Done!" return 0
def setup(self):
    """Parses command-line arguments, fetches the page, and builds the query list."""
    parser = argparse.ArgumentParser(description="Page Calculator for pages")
    parser.add_argument("-u", "--url", type=str,
                        help="url address")
    parser.add_argument("-e", "--engine", type=str,
                        help="Name of search engine: " + ENGINE_LIST.__str__())
    parser.add_argument("-k", "--key", type=str,
                        help="API Key for search engine (if applicable)")
    parser.add_argument("-d", "--domain", type=str,
                        help="domain for search engine (if applicable, i.e. engine is sitebing, default is gla.ac.uk)")
    parser.add_argument("-c", "--cutoff", type=int,
                        help="The cutoff value for queries")
    parser.add_argument("-m", "--maxqueries", type=int,
                        help="The maximum number of queries per page")
    parser.add_argument("-s", "--stopwordfile", type=str,
                        help="The name of the file containing stopwords")
    parser.add_argument("-ca", "--cache", action="store_true", default=False,
                        help="use cache")
    #parser.add_argument("-ex", "--experiment", type=int, help="experiment number 1 - x")
    args = parser.parse_args()

    if not args.url:
        print "Check your URL argument"
        parser.print_help()
        return 2
    else:
        self.url = args.url

    if args.cache:
        self.cache = 'engine'
    else:
        self.cache = None

    if args.key:
        self.engine = EngineFactory(engine=args.engine, api_key=args.key,
                                    throttle=0.1, cache=self.cache)
    else:
        self.engine = EngineFactory(engine=args.engine, cache=self.cache,
                                    throttle=0.1)
    if args.domain:
        self.engine.site = args.domain

    if args.stopwordfile:
        self.stopwordfile = args.stopwordfile
    else:
        self.stopwordfile = None

    self.mq = 250
    if args.maxqueries:
        self.mq = args.maxqueries

    print "Fetching page: %s" % (args.url)
    pc = PageCapture(args.url)
    self.page_html = pc.get_page_sourcecode()
    print "Page loaded"

    self.page_text = ''
    # answer = raw_input("Do you want to use a percentage of this page? Enter y or n \n")
    # if answer == 'y':
    #     percent = raw_input("What percentage do you want to use? \n")
    #     if self.is_integer(percent):
    #         self.page_text = self.reduce_page(percentage=percent)
    #     else:
    #         print "input error, will exit"
    #         sys.exit(2)
    #     #todo update so it asks again instead of exiting
    # else:
    self.page_text = self.page_html

    query_list = []
    answer = raw_input("Do you want to use only a position based extractor? Enter y or n \n")
    if answer != 'n':  # if enter is hit then assume y
        text = self.get_position_text()
        #todo at this stage this could be single, bi or tri terms
        if self.stopwordfile:
            query_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
        else:
            query_gen = BiTermQueryGeneration(minlen=3)
        query_list = query_gen.extract_queries_from_text(text)
    else:
        answer = raw_input("Do you want to use only a rank based extractor? Enter y or n \n")
        if answer != 'n':  # if enter is hit then assume y
            query_list = self.get_ranked_queries()
        else:
            answer = raw_input("Do you want to use a rank based extractor combined with a position extractor? Enter y or n \n")
            if answer != 'n':  # if enter is hit then assume y
                text = self.get_position_text()
                query_list = self.get_ranked_queries(text)
            else:
                print "sorry, that's all the options, system will exit"
                sys.exit(0)

    print "Queries generated: ", len(query_list)

    # the original passed the same arguments in both branches; pass the cutoff
    # through when given, as the other scripts in this repo do
    if args.cutoff:
        prc = PageRetrievabilityCalculator(engine=self.engine,
                                           max_queries=self.mq, cutoff=args.cutoff)
    else:
        prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.mq)

    prc.score_page(self.url, query_list)
    print "\nRetrievability Scores for cumulative c=20"
    prc.calculate_page_retrievability(c=20)
    prc.report()
    print "\nRetrievability Scores for gravity beta=1.0"
    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()
    print "Done!"
    return 0
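# get_position_text and get_ranked_queries are used above but not shown in
# this extract. A hypothetical sketch of the position-based extractor,
# assuming it keeps only text near the top of the page (the crude tag
# stripping and word limit are guesses, not the original implementation):
import re

def get_position_text(self, max_words=100):
    """Return roughly the first max_words words of the page's visible text."""
    text = re.sub(r'<[^>]+>', ' ', self.page_text)  # crude HTML tag stripping
    return ' '.join(text.split()[:max_words])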
def main():
    logger = create_ifind_logger('test_game_mech.log')
    logger.info("Program started")
    logger.info('Testing game mechanics')
    print "This script is to test the GameMechanics and their interaction with the Models"

    ds = EngineFactory("Dummy")
    gm = GameMechanic(ds)
    print gm

    u = User.objects.filter(username='******')
    if u:
        u = u[0]
    else:
        print "Adding testy user"
        u = User(username='******', password='******')
        u.save()

    c = Category.objects.filter(name='Numbers')
    if c:
        c = c[0]
    else:
        print "Adding a Numbers Category"
        c = Category(name='Numbers', desc='Looking for sites that are about numbers')
        c.save()

    pages = Page.objects.filter(category=c)
    if not pages:
        print "Adding pages"
        for pn in ['one', 'two', 'three', 'four']:
            p = Page(category=c, title=pn, url='www.' + pn + '.com',
                     snippet=pn, desc=('desc: ' + pn))
            p.save()
        pages = Page.objects.filter(category=c)

    print u
    print c
    print pages

    gm.create_game(u, c)
    print gm
    print "Game is set up to play"
    raw_input('Press enter to continue')

    while not gm.is_game_over():
        clear_screen()
        print gm
        last_query = gm.get_last_query()
        if last_query:
            print "\nLast Query: %s and Query Score: %d" % (
                last_query, gm.get_last_query_score())
        state = handle_game_input()
        if state == 1:
            gm.take_points()
            gm.set_next_page()
            state = 0
        if state == 2:
            query = handle_query_input()
            gm.handle_query(query)

    print '\nGame Over!!\n'
    print gm
    logger.info("Done!")
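# clear_screen, handle_game_input and handle_query_input are used above but
# not shown in this extract. Minimal sketches, assuming handle_game_input
# returns 1 to take points and 2 to issue a query (inferred from the loop):
import os

def clear_screen():
    os.system('cls' if os.name == 'nt' else 'clear')

def handle_game_input():
    choice = raw_input('Enter 1 to take points and move on, 2 to issue a query: ')
    return 2 if choice == '2' else 1

def handle_query_input():
    return raw_input('Enter your query: ')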