def update_model(self, search_context):
    """
    Rebuild the topic language model from the topic text plus the snippet
    text obtained from the given search context.

    Returns True when the model was rebuilt, and False when updating is
    switched off or no snippet text is left after the term check.
    """
    if not self.updating:
        return False

    snippet_text = self._check_terms(self._get_snip_text(search_context))
    if not snippet_text:
        return False

    # Topic and snippet text are pooled into one string, so a single set of
    # term counts backs the model (rather than separate per-source models).
    topic_text = search_context.topic.get_topic_text()
    combined_text = '{0} {1}'.format(topic_text, snippet_text)
    counts = lm_methods.extract_term_dict_from_text(
        combined_text, self._stopword_file)

    empirical_model = LanguageModel(term_dict=counts)
    self.topic_lang_model = empirical_model

    # Swap in a smoothed model when a background model is available.
    if self.background_language_model:
        self.topic_lang_model = SmoothedLanguageModel(
            empirical_model, self.background_language_model)

    return True
def __update_topic_language_model(self, text_list):
    """
    Rebuild self.topic_language_model from the topic plus newly seen text.

    The topic text (title three times, then content, taken from the topic's
    instance attributes) and the space-joined entries of text_list are each
    run through the term extractor. The topic counts are added onto the
    new-text counts, and the merged counts back a LanguageModel that is
    wrapped in a SmoothedLanguageModel together with the background model
    and self.mu.

    text_list: iterable of strings to fold into the topic model.
    """
    # NOTE(review): the title is repeated, presumably to weight title terms
    # more heavily than content terms -- confirm.
    topic_text = '{title} {title} {title} {content}'.format(
        **self._topic.__dict__)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)

    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    term_extractor.extract_queries_from_text(snippet_text)
    new_text_term_counts = term_extractor.query_count

    # Add each topic term's count onto the new-text counts, creating the
    # entry when the term did not occur in the new text.
    for term in topic_term_counts:
        new_text_term_counts[term] = (
            new_text_term_counts.get(term, 0) + topic_term_counts[term])

    new_language_model = LanguageModel(term_dict=new_text_term_counts)
    self.topic_language_model = SmoothedLanguageModel(
        new_language_model, self.background_language_model, self.mu)

    log.debug("Updating topic {0}".format(self._topic.id))
def make_topic_lm(self):
    """
    Build the topic language model and store it in self.topicLM.

    Term counts are extracted from the topic's content and title, wrapped in
    a LanguageModel, and combined with self.backgroundLM in a
    SmoothedLanguageModel. The document model's total occurrence count is
    printed afterwards.
    """
    # Join with a space: plain concatenation would fuse the last word of the
    # content with the first word of the title into one bogus term.
    topic_text = self.topic.content + ' ' + self.topic.title

    doc_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self.stopword_file)
    doc_extractor.extract_queries_from_text(topic_text)
    doc_term_counts = doc_extractor.query_count

    lm = LanguageModel(term_dict=doc_term_counts)
    # NOTE(review): 100 is presumably the smoothing parameter -- confirm
    # against SmoothedLanguageModel.
    self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)

    # Single-argument print(...) emits the same text as the old print
    # statement and parses under both Python 2 and Python 3.
    print("making topic {0}".format(self.topicLM.docLM.total_occurrences))
def _generate_topic_language_model(self, search_context):
    """
    Build the language model for the search topic.

    Returns the empirical model built from the topic text's term counts, or
    that model wrapped in a SmoothedLanguageModel when a background language
    model has been loaded.
    """
    text = self._make_topic_text(search_context)
    counts = lm_methods.extract_term_dict_from_text(
        text, self._stopword_file)
    empirical_model = LanguageModel(term_dict=counts)

    # Without a background model there is nothing to smooth against.
    if not self.background_language_model:
        return empirical_model

    return SmoothedLanguageModel(
        empirical_model, self.background_language_model)
def make_topic_language_model(self):
    """
    Build the topic language model and store it in
    self.topic_language_model.

    Term counts are extracted from the topic's content and title, wrapped in
    a LanguageModel, and combined with self.background_language_model in a
    SmoothedLanguageModel. The document model's total occurrence count is
    printed afterwards.
    """
    # Join with a space: plain concatenation would fuse the last word of the
    # content with the first word of the title into one bogus term.
    topic_text = self._topic.content + ' ' + self._topic.title

    document_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    document_extractor.extract_queries_from_text(topic_text)
    document_term_counts = document_extractor.query_count

    language_model = LanguageModel(term_dict=document_term_counts)
    # NOTE(review): 100 is presumably the smoothing parameter -- confirm
    # against SmoothedLanguageModel.
    self.topic_language_model = SmoothedLanguageModel(
        language_model, self.background_language_model, 100)

    # Single-argument print(...) emits the same text as the old print
    # statement and parses under both Python 2 and Python 3.
    print("making topic {0}".format(
        self.topic_language_model.docLM.total_occurrences))
def make_topic_language_model(self):
    """
    Build the topic language model and store it in
    self.topic_language_model.

    The topic text is the title three times followed by the content, both
    read from the topic's instance attributes. Its term counts back a
    LanguageModel, which is wrapped in a SmoothedLanguageModel together with
    the background model and self.mu. The document model's total occurrence
    count is logged at debug level.
    """
    topic_fields = self._topic.__dict__
    weighted_text = '{title} {title} {title} {content}'.format(
        **topic_fields)

    extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    extractor.extract_queries_from_text(weighted_text)

    empirical_model = LanguageModel(term_dict=extractor.query_count)
    self.topic_language_model = SmoothedLanguageModel(
        empirical_model, self.background_language_model, self.mu)

    log.debug("Making topic {0}".format(
        self.topic_language_model.docLM.total_occurrences))