def __update_topic_language_model(self, text_list):
    """
    Rebuilds self.topic_language_model from the topic description plus the
    supplied list of relevant texts.

    The topic title is repeated three times to up-weight title terms
    relative to content terms. Counts from the topic text are merged into
    the counts extracted from the relevant texts before smoothing against
    the background model with parameter self.mu.
    """
    topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    # NOTE(review): if query_count accumulates across extract calls rather
    # than resetting, topic terms end up counted twice after the merge
    # below — confirm SingleQueryGeneration's behavior.
    term_extractor.extract_queries_from_text(snippet_text)
    new_text_term_counts = term_extractor.query_count

    # Merge topic term counts into the relevant-text counts.
    for term, count in topic_term_counts.items():
        new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

    new_language_model = LanguageModel(term_dict=new_text_term_counts)
    self.topic_language_model = SmoothedLanguageModel(
        new_language_model, self.background_language_model, self.mu)
    log.debug("Updating topic {0}".format(self._topic.id))
def make_topic_lm(self): topic_text = self.topic.content + self.topic.title doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file) doc_extractor.extract_queries_from_text(topic_text) doc_term_counts = doc_extractor.query_count lm = LanguageModel(term_dict=doc_term_counts) self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100) print "making topic", self.topicLM.docLM.total_occurrences
def make_topic_language_model(self): """ """ topic_text = self._topic.content + self._topic.title document_extractor = SingleQueryGeneration( minlen=3, stopwordfile=self._stopword_file) document_extractor.extract_queries_from_text(topic_text) document_term_counts = document_extractor.query_count language_model = LanguageModel(term_dict=document_term_counts) self.topic_language_model = SmoothedLanguageModel( language_model, self.background_language_model, 100) print "making topic", self.topic_language_model.docLM.total_occurrences
def update_model(self, search_context):
    """
    Rebuilds self.topic_lang_model from the topic text plus examined
    snippet text drawn from the search context.

    Returns True if the model was updated; False when updating is disabled
    or no usable snippet text is available. When a background language
    model is present, the rebuilt model is smoothed against it.
    """
    if not self.updating:
        return False

    snippet_text = self._get_snip_text(search_context)
    snippet_text = self._check_terms(snippet_text)

    if not snippet_text:
        return False

    topic_text = search_context.topic.get_topic_text()
    all_text = '{0} {1}'.format(topic_text, snippet_text)

    term_counts = lm_methods.extract_term_dict_from_text(
        all_text, self._stopword_file)
    language_model = LanguageModel(term_dict=term_counts)
    self.topic_lang_model = language_model

    # Smooth against the background model when one has been loaded.
    if self.background_language_model:
        self.topic_lang_model = SmoothedLanguageModel(
            language_model, self.background_language_model)

    return True
class IFindTextClassifier(BaseTextClassifier): """ """ def __init__(self, topic, stopword_file=[], background_file=[]): """ """ super(IFindTextClassifier, self).__init__(topic, stopword_file, background_file) self.make_topic_language_model() def make_topic_language_model(self): """ """ topic_text = self._topic.content + self._topic.title document_extractor = SingleQueryGeneration( minlen=3, stopwordfile=self._stopword_file) document_extractor.extract_queries_from_text(topic_text) document_term_counts = document_extractor.query_count language_model = LanguageModel(term_dict=document_term_counts) self.topic_language_model = SmoothedLanguageModel( language_model, self.background_language_model, 100) print "making topic", self.topic_language_model.docLM.total_occurrences def is_relevant(self, document): """ """ score = 0.0 count = 0.0 for term in document.title.split(' '): score = score + self.__get_term_score(term) count = count + 1.0 for term in document.content.split(' '): score = score + self.__get_term_score(term) count = count + 1.0 if (score / count) > self.threshold: return True return False def __get_term_score(self, term): """ Returns a probability score for the given term when considering both the background and topic language models. """ topic_term_prob = self.topic_language_model.get_term_prob(term) background_term_prob = self.background_language_model.get_term_prob( term) if background_term_prob == 0.0: return 0.0 else: return math.log(topic_term_prob / background_term_prob, 2)
def __update_topic_language_model(self, text_list):
    """
    Rebuilds self.topic_language_model from the topic description (title
    weighted x3) merged with the supplied list of relevant texts, smoothed
    against the background model with parameter self.mu.
    """
    topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    # NOTE(review): if query_count accumulates across extract calls rather
    # than resetting, topic terms are double-counted by the merge below —
    # confirm SingleQueryGeneration's behavior.
    term_extractor.extract_queries_from_text(snippet_text)
    new_text_term_counts = term_extractor.query_count

    # Merge topic term counts into the relevant-text counts.
    for term, count in topic_term_counts.items():
        new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

    new_language_model = LanguageModel(term_dict=new_text_term_counts)
    self.topic_language_model = SmoothedLanguageModel(
        new_language_model, self.background_language_model, self.mu)
    log.debug("Updating topic {0}".format(self._topic.id))
def make_topic_language_model(self):
    """Builds the smoothed topic language model, with the title weighted three times."""
    weighted_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)
    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    extractor.extract_queries_from_text(weighted_text)
    term_counts = extractor.query_count
    topic_model = LanguageModel(term_dict=term_counts)
    self.topic_language_model = SmoothedLanguageModel(
        topic_model, self.background_language_model, self.mu)
    log.debug("Making topic {0}".format(
        self.topic_language_model.docLM.total_occurrences))
def make_topic_lm(self): topic_text = self.topic.content + self.topic.title doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file) doc_extractor.extract_queries_from_text(topic_text) doc_term_counts = doc_extractor.query_count lm = LanguageModel(term_dict=doc_term_counts) self.topicLM = SmoothedLanguageModel(lm,self.backgroundLM,100) print "making topic", self.topicLM.docLM.total_occurrences
class IFindTextClassifier(BaseTextClassifier): """ """ def __init__(self, topic, stopword_file=[], background_file=[]): """ """ super(IFindTextClassifier, self).__init__(topic, stopword_file, background_file) self.make_topic_language_model() def make_topic_language_model(self): """ """ topic_text = self._topic.content + self._topic.title document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file) document_extractor.extract_queries_from_text(topic_text) document_term_counts = document_extractor.query_count language_model = LanguageModel(term_dict=document_term_counts) self.topic_language_model = SmoothedLanguageModel(language_model, self.background_language_model, 100) print "making topic", self.topic_language_model.docLM.total_occurrences def is_relevant(self, document): """ """ score = 0.0 count = 0.0 for term in document.title.split(' '): score = score + self.__get_term_score(term) count = count + 1.0 for term in document.content.split(' '): score = score + self.__get_term_score(term) count = count + 1.0 if (score / count) > self.threshold: return True return False def __get_term_score(self, term): """ Returns a probability score for the given term when considering both the background and topic language models. """ topic_term_prob = self.topic_language_model.get_term_prob(term) background_term_prob = self.background_language_model.get_term_prob(term) if background_term_prob == 0.0: return 0.0 else: return math.log(topic_term_prob/background_term_prob, 2)
class iFindTextClassifier(TextClassifier): def __init__(self, stopword_file=[], background_file=[]): TextClassifier.__init__(self, stopword_file, background_file) self.topicLM = None self.threshold = -0.20 def set_topic(self, topic): self.topic = topic self.make_topic_lm() def make_topic_lm(self): topic_text = self.topic.content + self.topic.title doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file) doc_extractor.extract_queries_from_text(topic_text) doc_term_counts = doc_extractor.query_count lm = LanguageModel(term_dict=doc_term_counts) self.topicLM = SmoothedLanguageModel(lm,self.backgroundLM,100) print "making topic", self.topicLM.docLM.total_occurrences def is_relevant(self, document): #print "computing relevance", document.docid score = 0.0 count = 0.0 for t in document.title.split(' '): score = score + self._get_term_score(t) count += 1.0 for t in document.content.split(' '): score = score + self._get_term_score(t) count += 1.0 if (score/count) > self.threshold: return True else: return False def _get_term_score(self, term): ptd = self.topicLM.get_term_prob(term) pt = self.backgroundLM.get_term_prob(term) if pt == 0.0: return 0.0 else: return math.log( ptd/pt, 2)
def make_topic_language_model(self): """ """ topic_text = self._topic.content + self._topic.title document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file) document_extractor.extract_queries_from_text(topic_text) document_term_counts = document_extractor.query_count language_model = LanguageModel(term_dict=document_term_counts) self.topic_language_model = SmoothedLanguageModel(language_model, self.background_language_model, 100) print "making topic", self.topic_language_model.docLM.total_occurrences
def make_topic_language_model(self):
    """Builds the smoothed topic language model with the title up-weighted x3."""
    weighted_topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)
    query_gen = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    query_gen.extract_queries_from_text(weighted_topic_text)
    counts = query_gen.query_count
    base_model = LanguageModel(term_dict=counts)
    self.topic_language_model = SmoothedLanguageModel(
        base_model, self.background_language_model, self.mu)
    log.debug("Making topic {0}".format(
        self.topic_language_model.docLM.total_occurrences))
def _generate_topic_language_model(self, search_context):
    """
    Builds an empirical language model from the search topic's text.

    When a background model has been loaded, the empirical model is
    smoothed against it and the smoothed model is returned instead.
    """
    text = self._make_topic_text(search_context)
    counts = lm_methods.extract_term_dict_from_text(text, self._stopword_file)
    empirical_model = LanguageModel(term_dict=counts)

    if not self.background_language_model:
        return empirical_model

    return SmoothedLanguageModel(empirical_model, self.background_language_model)
class iFindTextClassifier(TextClassifier): def __init__(self, stopword_file=[], background_file=[]): TextClassifier.__init__(self, stopword_file, background_file) self.topicLM = None self.threshold = -0.20 def set_topic(self, topic): self.topic = topic self.make_topic_lm() def make_topic_lm(self): topic_text = self.topic.content + self.topic.title doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file) doc_extractor.extract_queries_from_text(topic_text) doc_term_counts = doc_extractor.query_count lm = LanguageModel(term_dict=doc_term_counts) self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100) print "making topic", self.topicLM.docLM.total_occurrences def is_relevant(self, document): #print "computing relevance", document.docid score = 0.0 count = 0.0 for t in document.title.split(' '): score = score + self._get_term_score(t) count += 1.0 for t in document.content.split(' '): score = score + self._get_term_score(t) count += 1.0 if (score / count) > self.threshold: return True else: return False def _get_term_score(self, term): ptd = self.topicLM.get_term_prob(term) pt = self.backgroundLM.get_term_prob(term) if pt == 0.0: return 0.0 else: return math.log(ptd / pt, 2)
class IFindTextClassifier(BaseTextClassifier):
    """
    Classifies documents by comparing a smoothed topic language model
    against a background model via per-term log-likelihood ratios.
    Supports updating the topic model from examined documents/snippets.
    """
    def __init__(self, topic, search_context, stopword_file=None, background_file=None):
        """
        Builds the topic language model on construction.

        Mutable default arguments ([]) replaced with None sentinels to
        avoid the shared-list pitfall; callers passing nothing still get
        an empty list.
        """
        stopword_file = [] if stopword_file is None else stopword_file
        background_file = [] if background_file is None else background_file
        super(IFindTextClassifier, self).__init__(
            topic, search_context, stopword_file, background_file)
        self.threshold = 0.0
        self.mu = 100.0
        self.make_topic_language_model()

    def make_topic_language_model(self):
        """Builds the smoothed topic model, with the title weighted x3."""
        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)
        document_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count
        language_model = LanguageModel(term_dict=document_term_counts)
        self.topic_language_model = SmoothedLanguageModel(
            language_model, self.background_language_model, self.mu)
        log.debug("Making topic {0}".format(
            self.topic_language_model.docLM.total_occurrences))

    def is_relevant(self, document):
        """
        Returns True when the mean per-term score over the document's
        title and content exceeds self.threshold. Stores the mean score
        in self.doc_score as a side effect.
        """
        score = 0.0
        count = 0.0
        for term in document.title.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0
        for term in document.content.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0
        self.doc_score = (score / count)
        if self.doc_score > self.threshold:
            return True
        return False

    def __get_term_score(self, term):
        """
        Returns log2(P(term|topic) / P(term|background)), or 0.0 when the
        background probability is zero (term unseen in background).
        """
        topic_term_prob = self.topic_language_model.get_term_prob(term)
        background_term_prob = self.background_language_model.get_term_prob(term)
        if background_term_prob == 0.0:
            return 0.0
        else:
            return math.log(topic_term_prob / background_term_prob, 2.0)

    def update_model(self, search_context):
        """
        Updates the topic language model from positively-judged examined
        documents (update_method == 1) or snippets (otherwise).

        Returns True if the model was updated, False otherwise.
        Previously fell through returning None when updating was disabled;
        now returns False explicitly, consistent with the update path.
        """
        if not self.updating:
            return False
        ## Once we develop more update methods, it is probably worth making this a strategy
        ## so that setting the update_method changes the list of documents to use.
        if self.update_method == 1:
            document_list = search_context.get_all_examined_documents()
        else:
            document_list = search_context.get_all_examined_snippets()

        # Pull out the text of relevant (positively judged) items.
        rel_text_list = []
        for doc in document_list:
            if doc.judgment > 0:
                rel_text_list.append('{0} {1}'.format(doc.title, doc.content))

        if rel_text_list:
            self.__update_topic_language_model(rel_text_list)
            return True
        else:
            return False

    def __update_topic_language_model(self, text_list):
        """
        Rebuilds self.topic_language_model from the topic description
        (title weighted x3) merged with the supplied relevant texts,
        smoothed against the background model with parameter self.mu.
        """
        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        # NOTE(review): if query_count accumulates across extract calls
        # rather than resetting, topic terms are double-counted by the
        # merge below — confirm SingleQueryGeneration's behavior.
        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        # Merge topic term counts into the relevant-text counts.
        for term, count in topic_term_counts.items():
            new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

        new_language_model = LanguageModel(term_dict=new_text_term_counts)
        self.topic_language_model = SmoothedLanguageModel(
            new_language_model, self.background_language_model, self.mu)
        log.debug("Updating topic {0}".format(self._topic.id))
class IFindTextClassifier(BaseTextClassifier):
    """
    Classifies documents by comparing a smoothed topic language model
    against a background model via per-term log-likelihood ratios.
    Supports updating the topic model from examined documents/snippets.
    """
    def __init__(self, topic, stopword_file=None, background_file=None):
        """
        Builds the topic language model on construction.

        Mutable default arguments ([]) replaced with None sentinels to
        avoid the shared-list pitfall; callers passing nothing still get
        an empty list.
        """
        stopword_file = [] if stopword_file is None else stopword_file
        background_file = [] if background_file is None else background_file
        super(IFindTextClassifier, self).__init__(
            topic, stopword_file, background_file)
        self.threshold = 0.0
        self.mu = 100.0
        self.make_topic_language_model()

    def make_topic_language_model(self):
        """Builds the smoothed topic model, with the title weighted x3."""
        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)
        document_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count
        language_model = LanguageModel(term_dict=document_term_counts)
        self.topic_language_model = SmoothedLanguageModel(
            language_model, self.background_language_model, self.mu)
        log.debug("Making topic {0}".format(
            self.topic_language_model.docLM.total_occurrences))

    def is_relevant(self, document):
        """
        Returns True when the mean per-term score over the document's
        title and content exceeds self.threshold. Stores the mean score
        in self.doc_score as a side effect.
        """
        score = 0.0
        count = 0.0
        for term in document.title.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0
        for term in document.content.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0
        self.doc_score = (score / count)
        if self.doc_score > self.threshold:
            return True
        return False

    def __get_term_score(self, term):
        """
        Returns log2(P(term|topic) / P(term|background)), or 0.0 when the
        background probability is zero (term unseen in background).
        """
        topic_term_prob = self.topic_language_model.get_term_prob(term)
        background_term_prob = self.background_language_model.get_term_prob(term)
        if background_term_prob == 0.0:
            return 0.0
        else:
            return math.log(topic_term_prob / background_term_prob, 2.0)

    def update_model(self, search_context):
        """
        Updates the topic language model from positively-judged examined
        documents (update_method == 1) or snippets (otherwise).

        Returns True if the model was updated, False otherwise.
        Previously fell through returning None when updating was disabled;
        now returns False explicitly, consistent with the update path.
        """
        if not self.updating:
            return False
        ## Once we develop more update methods, it is probably worth making this a strategy
        ## so that setting the update_method changes the list of documents to use.
        if self.update_method == 1:
            document_list = search_context.get_all_examined_documents()
        else:
            document_list = search_context.get_all_examined_snippets()

        # Pull out the text of relevant (positively judged) items.
        rel_text_list = []
        for doc in document_list:
            if doc.judgment > 0:
                rel_text_list.append('{0} {1}'.format(doc.title, doc.content))

        if rel_text_list:
            self.__update_topic_language_model(rel_text_list)
            return True
        else:
            return False

    def __update_topic_language_model(self, text_list):
        """
        Rebuilds self.topic_language_model from the topic description
        (title weighted x3) merged with the supplied relevant texts,
        smoothed against the background model with parameter self.mu.
        """
        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        # NOTE(review): if query_count accumulates across extract calls
        # rather than resetting, topic terms are double-counted by the
        # merge below — confirm SingleQueryGeneration's behavior.
        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        # Merge topic term counts into the relevant-text counts.
        for term, count in topic_term_counts.items():
            new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

        new_language_model = LanguageModel(term_dict=new_text_term_counts)
        self.topic_language_model = SmoothedLanguageModel(
            new_language_model, self.background_language_model, self.mu)
        log.debug("Updating topic {0}".format(self._topic.id))