Example #1
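The class names used here match the ifind information-retrieval toolkit; a minimal import sketch, assuming its usual package layout (the exact module paths are assumptions, not given in the source):

from ifind.common.query_generation import SingleQueryGeneration, BiTermQueryGeneration  # paths assumed
from ifind.common.language_model import LanguageModel
from ifind.common.smoothed_language_model import BayesLanguageModel
from ifind.common.query_ranker import OddsRatioQueryRanker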
    def get_ranked_queries(self, text=''):
        """
        loads the background document model and generates the ranked queries
        :return: the queries in a list
        """
        if not text:
            text = self.page_html
        backgroundfile = 'background.txt'
        filename = raw_input("Enter the filename of the background file (default: background.txt): ")
        if filename:
            backgroundfile = filename
        print "background file is ", backgroundfile

        doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
        query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
        print "Loading background distribution"
        colLM = LanguageModel(file=backgroundfile)
        print "Background loaded, number of terms: ", colLM.get_num_terms()
        doc_extractor.extract_queries_from_html(text)
        doc_term_counts = doc_extractor.query_count
        print "Number of terms in document: %d" % (len(doc_term_counts))
        docLM = LanguageModel(term_dict=doc_term_counts)
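        # BayesLanguageModel smooths the document model with the collection
        # (background) model; with a Dirichlet-style prior the estimate is
        # roughly p(t|d) = (tf(t,d) + beta * p(t|C)) / (|d| + beta).
        # This reading of beta=500 is an assumption, not stated in the source.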
        slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
        query_list = query_generator.extract_queries_from_html(text)

        print "Queries generated: ", len(query_list)
        qr = OddsRatioQueryRanker(smoothed_language_model=slm)
        scored_queries = qr.calculate_query_list_probabilities(query_list)
        queries = qr.get_top_queries(self.mq)
        query_list = []
        for query in queries:
            query_list.append(query[0])
        return query_list
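A minimal usage sketch for this method; the enclosing class is not shown in the source, so the class name, constructor, and attributes below are hypothetical:

# calc = PageCalculator(url='http://www.example.com',   # hypothetical class
#                       stopwordfile='stopwords.txt')
# calc.mq = 50   # self.mq: maximum number of queries returned
# queries = calc.get_ranked_queries()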
Example #2
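main() additionally needs argparse and the search/page utilities; again, the module paths are assumptions based on the same toolkit layout:

import argparse

from ifind.search import EngineFactory        # path assumed
from ifind.search.engines import ENGINE_LIST  # path assumed
from ifind.common.pagecapture import PageCapture
from ifind.common.page_retrievability_calc import PageRetrievabilityCalculator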
def main():
    """

    :return:
    """
    parser = argparse.ArgumentParser(
        description="Page Retrievability Calculator")
    parser.add_argument("-u", "--url", type=str,
                        help="url address")
    parser.add_argument("-e","--engine",type=str,
                        help="Name of search engine: " + ENGINE_LIST.__str__())
    parser.add_argument("-k","--key",type=str,
                        help="API Key for search engine (if applicable)")
    parser.add_argument("-c","--cutoff", type=int,
                        help ="The cutoff value for queries")
    parser.add_argument("-m","--maxqueries", type=int,
                        help ="The maximum number of queries per page")
    parser.add_argument("-s","--stopwordfile", type=str,
                        help ="The filename name containing stopwords")
    parser.add_argument("-b","--backgroundfile", type=str,
                        help ="The filename name containing background term counts")
    parser.add_argument("-ca", "--cache",
                  action="store_true", default=False,
                  help="use cache")


    args = parser.parse_args()

    if not args.url:
        print "Check your URL argument"
        parser.print_help()
        return 2

    cache = None
    if args.cache:
        cache = 'engine'

    if args.key:
        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
    else:
        print "cache is ", cache
        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)


    stopwordfile = None
    if args.stopwordfile:
        stopwordfile = args.stopwordfile

    mq = 50
    if args.maxqueries:
        mq = args.maxqueries

    backgroundfile = 'background.txt'
    if args.backgroundfile:
        backgroundfile = args.backgroundfile

    doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    print "Loading background distribution"
    colLM = LanguageModel(file=backgroundfile)
    print "Background loaded, number of terms: ", colLM.get_num_terms()

    print "Fetching page: %s" % (args.url)
    pc = PageCapture(args.url)
    page_html = pc.get_page_sourcecode()
    print "Page loaded"
    doc_extractor.extract_queries_from_html(page_html)
    doc_term_counts = doc_extractor.query_count
    print "Number of terms in document: %d" % (len(doc_term_counts))
    docLM = LanguageModel(term_dict=doc_term_counts)
    slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
    query_list = query_generator.extract_queries_from_html(page_html)

    print "Queries generated: ", len(query_list)
    qr = OddsRatioQueryRanker(smoothed_language_model=slm)
    qr.calculate_query_list_probabilities(query_list)
    queries = qr.get_top_queries(mq)
    query_list = [query[0] for query in queries]

    prc = PageRetrievabilityCalculator(engine=engine)
    prc.score_page(args.url, query_list)
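    # Two weightings from the retrievability literature appear below:
    # "cumulative" counts a hit whenever the page ranks at or above cutoff c,
    # while "gravity" discounts a hit by 1/rank^beta (interpretation assumed,
    # not stated in the source).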

    print "\nRetrievability Scores for cumulative c=20"
    prc.calculate_page_retrievability(c=20)
    prc.report()
    print "\nRetrievability Scores for gravity beta=1.0"

    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()

    print "Done!"
    return 0
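A sketch of the entry point and a sample invocation; the script name page_calculator.py and the engine name are illustrative assumptions, not from the source:

if __name__ == '__main__':
    import sys
    sys.exit(main())

# Example:
#   python page_calculator.py -u http://www.example.com -e bing -m 25 -ca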