Exemplo n.º 1
0
    def collect_feedback_words(self):
        """Gather expansion terms from the feedback stored for the current topic.

        Resets ``self.words_to_add`` and ``self.NEList``, then walks every
        feedback entry found under ``self.id_topic`` in ``self.feedbacks``.
        For each entry:

        * ``feedback[1]`` is appended to ``self.rel_docs`` unless it is
          already present;
        * ``feedback[2]`` (the topic-name text) is tokenized and the raw
          tokens are appended to ``self.words_to_add``;
        * ``feedback[3]`` (the passage text) is passed through
          ``diversification.NErecognition``; the de-duplicated tokens of the
          result are appended to ``self.NEList``.  When ``self.domain_name``
          is truthy these tokens are stemmed/lemmatized and tokens whose
          lemma is an English stop word are dropped.

        A ``UnicodeError`` raised while processing an entry makes that entry
        contribute no named-entity tokens; processing continues with the
        next entry.

        Returns:
            A ``(named_entities, topic_words)`` tuple of de-duplicated lists
            (order is arbitrary), or ``([], [])`` when the current topic has
            no feedback recorded.
        """
        self.words_to_add = []
        self.NEList = []
        id_topic = self.id_topic
        if id_topic not in self.feedbacks:
            return [], []

        # Loaded lazily and only once per call: the stop-word list does not
        # change between tokens, so there is no reason to reload it for
        # every membership test.
        stop_words = None
        for feedback in self.feedbacks[id_topic]:
            if feedback[1] not in self.rel_docs:
                self.rel_docs.append(feedback[1])
            words_topic_name = feedback[2]
            passage_text = feedback[3]
            try:
                tokens = nltk.word_tokenize(' '.join(diversification.NErecognition(passage_text)))
                tokens_topic_name = nltk.word_tokenize(words_topic_name)
                # Topic-name tokens are stored raw; they are not stemmed here.
                self.words_to_add += tokens_topic_name
                NEList = list(set(tokens))
                # Single-argument print(): same output as the old print
                # statement on Python 2, and valid syntax on Python 3.
                print('%s %s' % ('Nelist', NEList))
                print('%s %s' % ('Topics title', tokens_topic_name))

                if self.domain_name and NEList:
                    if stop_words is None:
                        stop_words = set(stopwords.words('english'))
                    NEList = [pxml.stem_and_lemmatize(word) for word in NEList
                              if pxml.lemmatize(word.lower()) not in stop_words]
            except UnicodeError:
                # Best effort: skip the named entities of an entry that
                # cannot be processed instead of aborting the whole pass.
                NEList = []
            self.NEList += NEList
        return list(set(self.NEList)), list(set(self.words_to_add))
Exemplo n.º 2
0
 def process_words_feedback(self, words_to_add):
     """Turn feedback phrases into a list of unique, normalized tokens.

     Every string in ``words_to_add`` is lower-cased and tokenized with
     ``nltk.word_tokenize``; the tokens of all strings are pooled and
     de-duplicated.  When ``self.domain_name`` is truthy each remaining
     token is additionally passed through ``pxml.stem_and_lemmatize``.

     Note that de-duplication happens before stemming, so the returned
     list may contain the same stem more than once.  The order of the
     result is arbitrary (it comes from a set).

     Returns:
         The list of processed tokens (empty for empty input).
     """
     pooled = [token
               for phrase in words_to_add
               for token in nltk.word_tokenize(phrase.lower())]
     unique_tokens = list(set(pooled))
     if not self.domain_name:
         return unique_tokens
     return [pxml.stem_and_lemmatize(token) for token in unique_tokens]
Exemplo n.º 3
0
 def process_query(self, query):
     """Normalize ``query`` and store it on the instance.

     The domain of the query is looked up first
     (``self.dict_query_domain[query]`` mapped through
     ``self.map_name_domaine``) and stored in ``self.domain_name``; an
     unknown query or domain key raises ``KeyError`` before anything else
     is modified.  Every non-word character of the query is then replaced
     by a space, the text is tokenized with ``nltk.word_tokenize`` and,
     when ``self.domain_name`` is truthy, each token is passed through
     ``pxml.stem_and_lemmatize``.  The tokens joined by single spaces are
     stored in both ``self.raw_query`` and ``self.query``.

     Returns:
         None; the result is available through the instance attributes.
     """
     domain_key = self.dict_query_domain[query]
     self.domain_name = self.map_name_domaine[domain_key]

     words = nltk.word_tokenize(re.sub(r'[^\w]', ' ', query))
     if self.domain_name:
         words = [pxml.stem_and_lemmatize(token) for token in words]

     normalized = ' '.join(words)
     # Both attributes receive the same normalized text.
     self.raw_query = self.query = normalized
Exemplo n.º 4
0
 def format_query(self, query):
     """Return the stemmed/lemmatized tokens of ``query``.

     Non-word characters are replaced by spaces, the text is lower-cased
     and tokenized with ``nltk.word_tokenize``, and every token is passed
     through ``pxml.stem_and_lemmatize``.  No instance state is read or
     modified.

     Returns:
         The list of processed tokens, in query order.
     """
     cleaned = re.sub(r'[^\w]', ' ', query).lower()
     return [pxml.stem_and_lemmatize(token)
             for token in nltk.word_tokenize(cleaned)]