示例#1
0
    def get_ranked_queries(self, text=''):
        """
        loads the background document model and generates the ranked queries
        :return: the queries in a list
        """
        if not text:
            text = self.page_html
        backgroundfile = 'background.txt'
        filename = raw_input("enter the filename of the background file, background.txt is default")
        if filename:
            backgroundfile = filename
        print "background file is ", backgroundfile

        doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=self.stopwordfile)
        query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
        print "Loading background distribution"
        colLM = LanguageModel(file=backgroundfile)
        print "Background loaded, number of terms: ", colLM.get_num_terms()
        #doc_extractor.extract_queries_from_html(self.page_html)
        doc_extractor.extract_queries_from_html(text)
        doc_term_counts = doc_extractor.query_count
        print "Number of terms in document: %d" % (len(doc_term_counts))
        docLM = LanguageModel(term_dict=doc_term_counts)
        slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
        #query_list = query_generator.extract_queries_from_html(self.page_html)
        query_list = query_generator.extract_queries_from_html(text)

        print "Queries generated: ", len(query_list)
        qr = OddsRatioQueryRanker(smoothed_language_model=slm)
        scored_queries = qr.calculate_query_list_probabilities(query_list)
        queries = qr.get_top_queries(self.mq)
        query_list = []
        for query in queries:
            query_list.append(query[0])
        return query_list
示例#2
0
    def make_topic_language_model(self):
        """
        Combines term counts from the topic and background to produce the language model.
        """
        # Term counts from the TREC topic title and description text.
        title_desc_counts = extract_term_dict_from_text(
            self._make_topic_text(), self._stopword_file)

        # Weighted combination: topic terms first, then background terms.
        combined = self._combine_dictionaries(
            {}, title_desc_counts, self.topic_weighting)
        combined = self._combine_dictionaries(
            combined, self._topic.background_terms,
            self.topic_background_weighting)

        # The topic LM is built straight from the combined count dictionary.
        self.topic_language_model = LanguageModel(term_dict=combined)

        log.debug("Making topic {0}".format(self._topic.id))
示例#3
0
    def __update_topic_language_model(self, text_list):
        """
        Rebuilds the topic language model from the topic text (title repeated
        three times relative to the content) plus the snippet/document text in
        text_list, smoothed against the background model with parameter mu.

        :param text_list: list of snippet/document strings seen so far.
        """
        # Weight the title three times relative to the content.
        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        # Fold the topic term counts into the snippet term counts.
        for term, count in topic_term_counts.items():
            new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

        new_language_model = LanguageModel(term_dict=new_text_term_counts)

        self.topic_language_model = SmoothedLanguageModel(
            new_language_model, self.background_language_model, self.mu)

        log.debug("Updating topic {0}".format(self._topic.id))
示例#4
0
    def _update_topic_language_model(self, text_list):
        """
        Updates the language model for the topic, given snippet/document text (text_list) and prior (knowledge) text.
        """
        # Gather the three sources of term evidence.
        topic_counts = extract_term_dict_from_text(
            self._make_topic_text(), self._stopword_file)
        background_counts = self._topic.background_terms
        document_counts = extract_term_dict_from_text(
            ' '.join(text_list), self._stopword_file)

        # Fold each source into a single weighted count dictionary.
        combined = {}
        for counts, weighting in (
                (topic_counts, self.topic_weighting),
                (background_counts, self.topic_background_weighting),
                (document_counts, self.document_weighting)):
            combined = self._combine_dictionaries(combined, counts, weighting)

        # Replace the topic model with one built from the combined counts.
        self.topic_language_model = LanguageModel(term_dict=combined)
        log.debug("Updating topic {0}".format(self._topic.id))
    def update_model(self, search_context):
        """
        Rebuilds the topic language model from the topic text plus any snippet
        text gathered from the search context, smoothing against the background
        model when one has been loaded.

        :param search_context: the simulation's search context object.
        :return: True if the model was updated, False otherwise (updating is
                 disabled, or no usable snippet text was found).
        """
        if not self.updating:
            return False

        snippet_text = self._get_snip_text(search_context)
        snippet_text = self._check_terms(snippet_text)

        # Guard clause: nothing usable to update the model with.
        if not snippet_text:
            return False

        topic_text = search_context.topic.get_topic_text()
        all_text = '{0} {1}'.format(topic_text, snippet_text)

        term_counts = lm_methods.extract_term_dict_from_text(
            all_text, self._stopword_file)
        language_model = LanguageModel(term_dict=term_counts)

        self.topic_lang_model = language_model
        # Smooth against the background model when one has been loaded.
        if self.background_language_model:
            self.topic_lang_model = SmoothedLanguageModel(
                language_model, self.background_language_model)

        return True
示例#6
0
    def _update_topic_language_model(self, text_list):
        """
        Rebuilds the topic language model from the topic text and the
        snippet/document text accumulated in text_list.

        :param text_list: list of snippet/document strings seen so far.
        """
        topic_text = self._make_topic_text(document_text=text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        # Fold the topic term counts into the snippet term counts.
        for term, count in topic_term_counts.items():
            new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

        self.topic_language_model = LanguageModel(
            term_dict=new_text_term_counts)

        log.debug("Updating topic {0}".format(self._topic.id))
示例#7
0
    def read_in_background(self, vocab_file):
        """
        Reads a comma-separated 'term,count' file and builds the background
        language model from it.

        :param vocab_file: path to a file of 'term,count' lines.
        """
        vocab = {}
        # 'with' guarantees the file is closed even if a line is malformed.
        with open(vocab_file, 'r') as f:
            for line in f:
                tc = line.split(',')
                vocab[tc[0]] = int(tc[1])

        self.backgroundLM = LanguageModel(term_dict=vocab)
示例#8
0
    def make_topic_lm(self, topic):
        """
        Builds a simple count-based language model from the topic's content.

        :param topic: topic object whose content supplies the terms.
        :return: a LanguageModel over the topic's term counts.
        """
        extractor = SingleQueryGeneration(minlen=3,
                                          stopwordfile=self.stopword_file)
        extractor.extract_queries_from_text(topic.content)
        return LanguageModel(term_dict=extractor.query_count)
示例#9
0
    def make_topic_lm(self):
        topic_text = self.topic.content + self.topic.title

        doc_extractor = SingleQueryGeneration(minlen=3,
                                              stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count
        lm = LanguageModel(term_dict=doc_term_counts)
        self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)
        print "making topic", self.topicLM.docLM.total_occurrences
示例#10
0
 def _generate_topic_language_model(self, topic):
     """
     Builds a topic language model: a model over the topic title,
     Bayes-smoothed (beta=10) with a model over the topic content.
     """
     extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

     extractor.extract_queries_from_text(topic.title)
     title_counts = extractor.query_count

     extractor.extract_queries_from_text(topic.content)
     content_counts = extractor.query_count

     title_model = LanguageModel(term_dict=title_counts)
     content_model = LanguageModel(term_dict=content_counts)
     return BayesLanguageModel(title_model, content_model, beta=10)
示例#11
0
    def make_topic_lm(self, topic):
        """
        Builds a topic language model from the topic title, Bayes-smoothed
        (beta=10) with a model built from the topic content.

        :param topic: topic object providing title and content text.
        :return: a BayesLanguageModel over the title and content models.
        """
        extractor = SingleQueryGeneration(minlen=3,
                                          stopwordfile=self.stopword_file)

        extractor.extract_queries_from_text(topic.title)
        title_counts = extractor.query_count

        extractor.extract_queries_from_text(topic.content)
        content_counts = extractor.query_count

        title_model = LanguageModel(term_dict=title_counts)
        content_model = LanguageModel(term_dict=content_counts)

        return BayesLanguageModel(title_model, content_model, beta=10)
def compute_info_gain(word_list, language_model):
    """
    Computes the information gain of word_list relative to language_model.

    For each distinct word w, adds p(w) * (log p(w|C) - log p(w)), where p(w)
    comes from a language model built over word_list and p(w|C) from the
    supplied (collection) language model. Words with zero collection
    probability contribute nothing.

    :param word_list: list of word strings (may be empty).
    :param language_model: model exposing get_term_prob(word).
    :return: the information gain as a float (0.0 for an empty list).
    """
    word_dict = {}
    for word in word_list:
        word_dict[word] = word_dict.get(word, 0) + 1
    wlm = LanguageModel(term_dict=word_dict)

    ig = 0.0
    for word in word_dict:
        pw = wlm.get_term_prob(word)
        pwc = language_model.get_term_prob(word)
        # log is undefined at 0: a zero collection probability adds no gain.
        if pwc > 0.0:
            ig += pw * (math.log(pwc) - math.log(pw))

    return ig
示例#13
0
 def read_in_background(self, vocab_file):
     """
     Helper method to read in a file containing terms and construct a background language model.

     :param vocab_file: path to a file of 'term,count' lines.
     """
     vocab = {}
     # 'with' guarantees the file is closed even if parsing raises.
     with open(vocab_file, 'r') as f:
         for line in f:
             tc = line.split(',')
             vocab[tc[0]] = int(tc[1])

     self.background_language_model = LanguageModel(term_dict=vocab)
示例#14
0
    def make_topic_language_model(self):
        """
        Generates a topic language model from the topic text's term counts.
        """
        term_counts = extract_term_dict_from_text(
            self._make_topic_text(), self._stopword_file)

        # An empirical (unsmoothed) model over the topic text.
        self.topic_language_model = LanguageModel(term_dict=term_counts)

        log.debug("Making topic {0}".format(self._topic.id))
示例#15
0
 def _generate_naive_topic_language_model(self, topic):
     """
     Given a Topic object, returns a language model representation for the given topic.
     Override this method in inheriting classes to generate and return different language models.
     """
     extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
     extractor.extract_queries_from_text(topic.content)

     # The language model returned is simply a representation of the number
     # of times each term occurs within the topic text.
     return LanguageModel(term_dict=extractor.query_count)
示例#16
0
    def _generate_topic_language_model(self, search_context):
        """
        Given a Topic object, returns a language model representation for the given topic.
        Override this method in inheriting classes to generate and return different language models.
        """
        topic = search_context.topic
        combined_text = "{0} {1}".format(topic.title, topic.content)

        counts = lm_methods.extract_term_dict_from_text(combined_text,
                                                        self._stopword_file)

        # A plain count-based model over the topic's title and content text.
        return LanguageModel(term_dict=counts)
示例#17
0
def read_in_background(vocab_file):
    """
    Helper method to read in a file containing terms and construct a background language model.
    Returns a LanguageModel instance trained on the vocabulary file passed.

    :param vocab_file: path to a file of 'term,count' lines.
    """
    vocab = {}
    # 'with' guarantees the file is closed even if parsing raises.
    with open(vocab_file, 'r') as f:
        for line in f:
            tc = line.split(',')
            vocab[tc[0]] = int(tc[1])

    return LanguageModel(term_dict=vocab)
示例#18
0
    def _generate_topic_language_model(self, topic):
        """
        Given a Topic object, returns a language model representation for the given topic.
        Override this method in inheriting classes to generate and return different language models.
        """
        extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)

        extractor.extract_queries_from_text(topic.title)
        title_counts = extractor.query_count

        extractor.extract_queries_from_text(topic.content)
        content_counts = extractor.query_count

        # Smooth the title model with the content model (Bayes, beta=10).
        title_model = LanguageModel(term_dict=title_counts)
        content_model = LanguageModel(term_dict=content_counts)
        return BayesLanguageModel(title_model, content_model, beta=10)
示例#19
0
    def make_topic_language_model(self):
        """
        
        """
        topic_text = self._topic.content + self._topic.title

        document_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count

        language_model = LanguageModel(term_dict=document_term_counts)
        self.topic_language_model = SmoothedLanguageModel(
            language_model, self.background_language_model, 100)
        print "making topic", self.topic_language_model.docLM.total_occurrences
示例#20
0
    def _generate_topic_language_model(self, search_context):
        """
        creates an empirical language model based on the search topic, or a smoothed language model if a background model has been loaded.
        """
        counts = lm_methods.extract_term_dict_from_text(
            self._make_topic_text(search_context), self._stopword_file)
        topic_model = LanguageModel(term_dict=counts)

        # Without a background model, return the raw empirical model.
        if not self.background_language_model:
            return topic_model
        return SmoothedLanguageModel(topic_model,
                                     self.background_language_model)
示例#21
0
    def make_topic_language_model(self):
        """
        Builds the topic language model, weighting the title three times
        relative to the content, smoothed against the background model
        with parameter mu.
        """
        # Title is repeated three times to weight it over the content.
        weighted_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)

        extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        extractor.extract_queries_from_text(weighted_text)

        topic_model = LanguageModel(term_dict=extractor.query_count)
        self.topic_language_model = SmoothedLanguageModel(
            topic_model, self.background_language_model, self.mu)
        log.debug("Making topic {0}".format(
            self.topic_language_model.docLM.total_occurrences))
示例#22
0
def main():
    """

    :return:
    """
    parser = argparse.ArgumentParser(
                                description="Page Calculator for pages")
    parser.add_argument("-u", "--url", type=str,
                        help="url address")
    parser.add_argument("-e","--engine",type=str,
                        help="Name of search engine: " + ENGINE_LIST.__str__())
    parser.add_argument("-k","--key",type=str,
                        help="API Key for search engine (if applicable)")
    parser.add_argument("-c","--cutoff", type=int,
                        help ="The cutoff value for queries")
    parser.add_argument("-m","--maxqueries", type=int,
                        help ="The maximum number of queries per page")
    parser.add_argument("-s","--stopwordfile", type=str,
                        help ="The filename name containing stopwords")
    parser.add_argument("-b","--backgroundfile", type=str,
                        help ="The filename name containing background term counts")
    parser.add_argument("-ca", "--cache",
                  action="store_true", default=False,
                  help="use cache")


    args = parser.parse_args()

    if not args.url:
        print "Check your URL argument"
        parser.print_help()
        return 2

    cache = None
    if args.cache:
        cache = 'engine'

    if args.key:
        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
    else:
        print "cache is ", cache
        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)


    stopwordfile = None
    if args.stopwordfile:
        stopwordfile = args.stopwordfile

    mq = 50
    if args.maxqueries:
        mq = args.maxqueries

    backgroundfile = 'background.txt'
    if args.backgroundfile:
        backgroundfile = args.backgroundfile

    doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=stopwordfile)
    query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    print "Loading background distribution"
    colLM = LanguageModel(file=backgroundfile)
    print "Background loaded, number of terms: ", colLM.get_num_terms()

    print "Fetching page: %s" % (args.url)
    pc = PageCapture(args.url)
    page_html = pc.get_page_sourcecode()
    print "Page loaded"
    doc_extractor.extract_queries_from_html(page_html)
    doc_term_counts = doc_extractor.query_count
    print "Number of terms in document: %d" % (len(doc_term_counts))
    docLM = LanguageModel(term_dict=doc_term_counts)
    slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
    query_list = query_generator.extract_queries_from_html(page_html)

    print "Queries generated: ", len(query_list)
    qr = OddsRatioQueryRanker(smoothed_language_model=slm)
    scored_queries = qr.calculate_query_list_probabilities(query_list)
    queries = qr.get_top_queries(mq)
    query_list = []
    for query in queries:
        query_list.append(query[0])


    prc = PageRetrievabilityCalculator(engine=engine)
    prc.score_page(args.url, query_list)

    print "\nRetrievability Scores for cumulative c=20"
    prc.calculate_page_retrievability(c=20)
    prc.report()
    print "\nRetrievability Scores for gravity beta=1.0"

    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()

    print "Done!"
    return 0