def create_dictionary(self, feature_to_do):
    """Build the per-feature vocabulary from the training tweets.

    For every index in ``feature_to_do`` the feature name is looked up in
    ``self.features`` and ``self.dictionary[name]`` is (re)initialised to
    the set of distinct values that feature takes across
    ``self.tweets.get_training_base()``.

    * When a tweet's value for the feature is a list, each element is
      considered separately and elements found in the stop-word list are
      skipped.
    * Any other value (including ``None``) is taken as a single entry and
      is *not* checked against the stop-word list.

    The number of occurrences of every ``(feature name, value)`` pair is
    accumulated and handed to ``self.tweets.set_previously_searched`` as a
    plain dict once all features have been processed.

    :param feature_to_do: iterable of indices into ``self.features``.
    """
    self.dictionary = {}
    occurrences = {}
    stop_words = methods.load_stop_words()
    tweet_list = self.tweets.get_training_base()

    for i in feature_to_do:
        feature = self.features[i]
        vocabulary = set()
        self.dictionary[feature] = vocabulary

        for tweet in tweet_list:
            values = tweet.get(feature)

            if isinstance(values, list):
                for value in values:
                    if value in stop_words:
                        continue
                    key = (feature, value)
                    occurrences[key] = occurrences.get(key, 0) + 1
                    # Vocabulary membership is kept independent of the
                    # counter so that a feature index listed twice does not
                    # end up with an empty vocabulary.
                    vocabulary.add(value)
            else:
                key = (feature, values)
                occurrences[key] = occurrences.get(key, 0) + 1
                vocabulary.add(values)

    self.tweets.set_previously_searched(occurrences)
def split_text(self, text, min_tam_word):
    """Split ``text`` on whitespace and optionally filter the words.

    :param text: string to split, or ``None``.
    :param min_tam_word: when greater than zero, every word is passed
        through ``methods.process_text(word, min_tam_word)`` and kept only
        if the result is non-empty and not a stop word. Presumably a
        minimum word length -- confirm against ``methods.process_text``.
        When zero or negative the raw split is returned unchanged.
    :returns: ``None`` when ``text`` is ``None``; otherwise a list of
        words.
    """
    if text is None:
        return None

    words = text.split()
    if min_tam_word > 0:
        # Stop words are only needed on this path, so they are loaded here
        # rather than unconditionally on every call.
        stop_words = methods.load_stop_words()
        processed_words = (
            methods.process_text(word, min_tam_word) for word in words
        )
        return [
            word
            for word in processed_words
            if word != "" and word not in stop_words
        ]
    return words
def create_base_list(self, tweet_list, feature_to_do):
    """Turn tweets into parallel lists of labels and feature vectors.

    For each tweet one weight is produced per vocabulary entry of every
    selected feature, in the iteration order of
    ``self.dictionary[feature]``:

    * the result of ``self.idf(training_base, tweet, word, j)`` when the
      entry matches the tweet (``word in values`` for list-valued features,
      ``word == values`` otherwise);
    * ``0`` when it does not match.

    A tweet is emitted only if at least one entry matched. Its label is
    ``1`` when ``tweet.get_manual_classification()`` equals
    ``"important"`` and ``-1`` for anything else.

    :param tweet_list: iterable of tweet objects exposing ``get(name)`` and
        ``get_manual_classification()``.
    :param feature_to_do: iterable of indices into ``self.features``.
    :returns: ``(label, value)`` -- a list of ``1``/``-1`` labels and a list
        of weight tuples, aligned by position.
    """
    label = []
    value = []

    for tweet in tweet_list:
        temp_label = 0  # 0 means "no vocabulary entry matched yet"
        temp_value = []

        for j in feature_to_do:
            feature = self.features[j]
            values = tweet.get(feature)
            is_list = isinstance(values, list)

            for word in self.dictionary[feature]:
                matched = (word in values) if is_list else (word == values)
                if not matched:
                    # Always emit an explicit zero so a weight computed for
                    # a previous word can never leak into this slot.
                    temp_value.append(0)
                    continue

                if temp_label == 0:
                    # The label is decided once, on the first match.
                    if tweet.get_manual_classification() == "important":
                        temp_label = 1
                    else:
                        temp_label = -1

                temp_value.append(
                    self.idf(self.tweets.get_training_base(), tweet, word, j)
                )

        if temp_label != 0:
            value.append(tuple(temp_value))
            label.append(temp_label)

    return label, value