def get_ranked_queries(self, text=''): """ loads the background document model and generates the ranked queries :return: the queries in a list """ if not text: text = self.page_html backgroundfile = 'background.txt' filename = raw_input("enter the filename of the background file, background.txt is default") if filename: backgroundfile = filename print "background file is ", backgroundfile doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=self.stopwordfile) query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile) print "Loading background distribution" colLM = LanguageModel(file=backgroundfile) print "Background loaded, number of terms: ", colLM.get_num_terms() #doc_extractor.extract_queries_from_html(self.page_html) doc_extractor.extract_queries_from_html(text) doc_term_counts = doc_extractor.query_count print "Number of terms in document: %d" % (len(doc_term_counts)) docLM = LanguageModel(term_dict=doc_term_counts) slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500) #query_list = query_generator.extract_queries_from_html(self.page_html) query_list = query_generator.extract_queries_from_html(text) print "Queries generated: ", len(query_list) qr = OddsRatioQueryRanker(smoothed_language_model=slm) scored_queries = qr.calculate_query_list_probabilities(query_list) queries = qr.get_top_queries(self.mq) query_list = [] for query in queries: query_list.append(query[0]) return query_list
def main(): """ :return: """ parser = argparse.ArgumentParser( description="Page Calculator for pages") parser.add_argument("-u", "--url", type=str, help="url address") parser.add_argument("-e","--engine",type=str, help="Name of search engine: " + ENGINE_LIST.__str__()) parser.add_argument("-k","--key",type=str, help="API Key for search engine (if applicable)") parser.add_argument("-c","--cutoff", type=int, help ="The cutoff value for queries") parser.add_argument("-m","--maxqueries", type=int, help ="The maximum number of queries per page") parser.add_argument("-s","--stopwordfile", type=str, help ="The filename name containing stopwords") parser.add_argument("-b","--backgroundfile", type=str, help ="The filename name containing background term counts") parser.add_argument("-ca", "--cache", action="store_true", default=False, help="use cache") args = parser.parse_args() if not args.url: print "Check your URL argument" parser.print_help() return 2 cache = None if args.cache: cache = 'engine' if args.key: engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache) else: print "cache is ", cache engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1) stopwordfile = None if args.stopwordfile: stopwordfile = args.stopwordfile mq = 50 if args.maxqueries: mq = args.maxqueries backgroundfile = 'background.txt' if args.backgroundfile: backgroundfile = args.backgroundfile doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=stopwordfile) query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile) print "Loading background distribution" colLM = LanguageModel(file=backgroundfile) print "Background loaded, number of terms: ", colLM.get_num_terms() print "Fetching page: %s" % (args.url) pc = PageCapture(args.url) page_html = pc.get_page_sourcecode() print "Page loaded" doc_extractor.extract_queries_from_html(page_html) doc_term_counts = doc_extractor.query_count print "Number of terms in document: %d" % (len(doc_term_counts)) docLM = LanguageModel(term_dict=doc_term_counts) slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500) query_list = query_generator.extract_queries_from_html(page_html) print "Queries generated: ", len(query_list) qr = OddsRatioQueryRanker(smoothed_language_model=slm) scored_queries = qr.calculate_query_list_probabilities(query_list) queries = qr.get_top_queries(mq) query_list = [] for query in queries: query_list.append(query[0]) prc = PageRetrievabilityCalculator(engine=engine) prc.score_page(args.url, query_list) print "\nRetrievability Scores for cumulative c=20" prc.calculate_page_retrievability(c=20) prc.report() print "\nRetrievability Scores for gravity beta=1.0" prc.calculate_page_retrievability(c=20, beta=1.0) prc.report() print "Done!" return 0