def __init__(self, type, key_words, secondary_words, excluding_words):
    """Initialise the entry from a type name and three word collections.

    For each word collection a fresh ``Tree.AVLTree`` is created and the
    collection is passed to its ``insert_array`` method; the value returned
    by that call is what gets stored on the instance.

    Args:
        type: Stored unchanged as ``self.type_name``.
        key_words: Words passed to ``insert_array`` for ``self.key_words``.
        secondary_words: Words passed to ``insert_array`` for
            ``self.secondary_words``.
        excluding_words: Words passed to ``insert_array`` for
            ``self.excluding_words``.
    """
    self.type_name = type
    # NOTE(review): the attributes hold the *return value* of insert_array,
    # not the tree object itself. If insert_array returns None these will
    # all be None -- confirm against Tree.AVLTree.
    self.key_words = Tree.AVLTree().insert_array(key_words)
    self.secondary_words = Tree.AVLTree().insert_array(secondary_words)
    self.excluding_words = Tree.AVLTree().insert_array(excluding_words)
def _build_word_tree(words):
    """Return a new ``Tree.AVLTree`` after calling ``insert_array(words)`` on it."""
    tree = Tree.AVLTree()
    tree.insert_array(words)
    return tree


def _print_best_match(label, scores, found_words, none_message):
    """Print the best-scoring type for one word category, or *none_message*.

    The first position holding the highest score wins, which matches what
    ``scores.index(max(scores))`` would select.
    """
    if not scores:
        # No types were evaluated at all (empty type range).
        print(none_message)
        return
    best = max(range(len(scores)), key=scores.__getitem__)
    # Scores can be negative once the excluding penalty is subtracted; a
    # type is only reported when its net score is strictly positive.
    if scores[best] > 0:
        # NOTE(review): get_data_name receives the position inside `scores`;
        # this presumably equals the type id only when Utils.MIN_TYPES is 0
        # -- confirm.
        print(label, Utils.get_data_name(best), found_words[best])
    else:
        print(none_message)


def palabras_repetidas_dictionary_with_tree(text_to_classify):
    """Classify *text_to_classify* with AVL-tree word lookups and print the result.

    The dictionary entries are read from the database and, for every index
    in ``range(Utils.MIN_TYPES, Utils.MAX_TYPES)``, their key, secondary and
    excluding words are loaded into separate AVL trees. The text (after
    ``Utils.delete_text_punctuation``) is searched against each tree with a
    per-category ``word_mark`` of 1, 0.25 and 1.5 respectively.

    The excluding value is subtracted once from the key score and twice from
    the secondary score. The previous revision negated the excluding value
    and then subtracted it, so excluding words *raised* the score; here they
    lower it.

    The best type per category is printed together with the words found for
    it, followed by the elapsed time. Nothing is returned.

    Args:
        text_to_classify: Text to classify; passed to
            ``Utils.delete_text_punctuation`` before searching.
    """
    import time

    start = time.perf_counter()
    print()
    print(" -------------------- Dictionary&Tree ----------------------- ")

    mongo_dictionary = DB.GET_dictionary_from_DB()  # from mongodb
    dictionary = []
    # Entries are stored in order, so they can be taken by index as they are.
    for i in range(Utils.MIN_TYPES, Utils.MAX_TYPES):
        entry = mongo_dictionary[i]
        dictionary.append(
            Utils.Dictionary(
                entry.type_name,
                _build_word_tree(entry.key_words),
                _build_word_tree(entry.secondary_words),
                _build_word_tree(entry.excluding_words),
            ))

    text_to_classify = Utils.delete_text_punctuation(text_to_classify)
    empty_words_tree = _build_word_tree(DB.GET_empty_words_from_DB())

    key_scores, secondary_scores = [], []
    found_key_words, found_secondary_words = [], []
    for sport in dictionary:
        key_value, key_found = sport.key_words.find_words_in_text(
            text_to_classify, word_mark=1, empty_words_tree=empty_words_tree)
        secondary_value, secondary_found = (
            sport.secondary_words.find_words_in_text(
                text_to_classify, word_mark=0.25,
                empty_words_tree=empty_words_tree))
        # Only the value of the excluding search is used; the matched
        # excluding words themselves were never reported.
        penalty, _ = sport.excluding_words.find_words_in_text(
            text_to_classify, word_mark=1.5,
            empty_words_tree=empty_words_tree)

        # Excluding words count against the type: once for the key score,
        # twice for the secondary score.
        key_scores.append(key_value - penalty)
        secondary_scores.append(secondary_value - penalty * 2)
        found_key_words.append(key_found)
        found_secondary_words.append(secondary_found)

    _print_best_match('Segun primary words:', key_scores, found_key_words,
                      'Ninguna key word encontrada')
    _print_best_match('Segun secondary words:', secondary_scores,
                      found_secondary_words,
                      'Ninguna secondary word encontrada')

    end = time.perf_counter()
    print('Ha tardo:', end - start, 'seg')