def create_vocabulary(row, voc):
    """Add every token of the row's intro+plot text to the vocabulary *voc*.

    Each previously unseen token is assigned the current value of the
    module-level counter ``voc_global``, which is then incremented so
    that every token gets a unique integer id.

    Parameters
    ----------
    row : mapping with 'intro' and 'plot' string fields (e.g. a DataFrame row).
    voc : dict mapping token -> integer id; mutated in place.
    """
    global voc_global
    try:
        text = utils.cleaner(row['intro'] + row['plot'])
    except (KeyError, TypeError):
        # Row is missing a field or one of them is not a string (e.g. NaN):
        # skip the row. Narrowed from a bare ``except`` so that unrelated
        # bugs (NameError, KeyboardInterrupt, ...) are no longer swallowed.
        return
    for token in text:
        if token not in voc:
            voc[token] = voc_global
            voc_global += 1
def search_engine_1_apply(row, word_dict, query, vocabulary):
    """Return 1 if *row*'s film contains every word of *query*, else 0.

    Conjunctive (AND) matching: each cleaned query word is mapped to its
    term id via *vocabulary*, then the film id must appear in that term's
    posting dict inside *word_dict*.

    Fix: a query word that never occurred in the corpus used to raise
    KeyError via ``vocabulary[elem]``; an unknown word now simply means
    no document can match, so we return 0.
    """
    cleaned_query = utils.cleaner(query)
    film_id = row['film_id']
    for word in cleaned_query:
        term_id = vocabulary.get(word)
        if term_id is None:
            return 0  # word absent from the whole corpus -> no match possible
        if term_id not in word_dict or film_id not in word_dict[term_id]:
            return 0  # this film lacks the word -> conjunction fails early
    # Every query word (possibly zero of them) matched.
    return 1
def cleaner_tsv_files_update_index(vocab, index, path_tsv):
    """Clean each article tsv under *path_tsv* and feed its tokens to updateIndex.

    Only files whose name starts with "article_" are processed; columns
    1..2 of the tab-separated content hold the text that gets indexed.
    """
    for filename in sorted(os.listdir(path_tsv)):
        if not filename.startswith("article_"):
            continue  # skip anything that is not an article file
        with open(path_tsv + filename, "r", encoding="utf-8") as handle:
            fields = handle.read().lower().split("\t")
        tokens = cleaner(" ".join(fields[1:3]))  # preprocess the indexed columns
        updateIndex(vocab, index, tokens, filename)
def cleaner_tsv_files_update_index_2(vocab, index1, index2, path_tsv):
    """Clean each article tsv under *path_tsv* and feed its tokens to updateIndex_2.

    Unlike the first variant, the whole file content (lower-cased) is
    cleaned, not just selected tab-separated columns.
    """
    for filename in sorted(os.listdir(path_tsv)):
        if not filename.startswith("article_"):
            continue  # only article files are indexed
        with open(path_tsv + filename, "r", encoding="utf-8") as handle:
            raw_text = handle.read().lower()  # read and normalise case
        updateIndex_2(vocab, index1, index2, cleaner(raw_text), filename)
def main():
    """Entry point: authenticate an account, then run the operations menu loop."""
    load_bank_data()
    cleaner()
    header()
    account_auth = auth_account()
    if not account_auth:
        # Authentication failed: report and stop (guard clause).
        print('Conta inválida')
        return
    cleaner()
    header()
    welcome(account_auth)
    while True:
        option_typed = get_menu_options_typed(account_auth)
        do_operation(option_typed, account_auth)
        print()
        loop_typed = input('Deseja sair da conta S/n? ')
        # Anything other than a lowercase 'n' ends the session.
        if loop_typed != 'n':
            print('Bye')
            break
def create_inverted_dict(row, inverted_dict, vocabulary):
    """Accumulate term frequencies for one film into *inverted_dict*.

    For every token of the row's intro+plot, increments the count stored
    at ``inverted_dict[term_id][film_id]``.

    Returns
    -------
    int : number of tokens processed, or 0 when the row's text cannot be
    cleaned (e.g. a missing/non-string field).
    """
    film_id = row['film_id']
    try:
        tokens = utils.cleaner(row['intro'] + row['plot'])
    except Exception:
        return 0  # unusable row: contribute nothing to the index
    for token in tokens:
        term_id = vocabulary[token]
        postings = inverted_dict.setdefault(term_id, {})
        postings[film_id] = postings.get(film_id, 0) + 1
    return len(tokens)
def create_vocabulary(vocab, path_tsv):
    """Build a word -> unique-integer-id vocabulary from the article tsv files.

    Parameters
    ----------
    vocab : dict, mutated in place; each new word is assigned id
        ``len(vocab)`` at insertion time, which makes ids unique.
    path_tsv : str, path (with trailing separator) of the folder that
        contains the tsv files.
    """
    for filename in sorted(os.listdir(path_tsv)):
        if not filename.startswith("article_"):
            continue  # only article files contribute words
        # Lower-case so "Tree" and "tree" collapse into one vocabulary entry.
        with open(path_tsv + filename, "r", encoding="utf-8") as handle:
            fields = handle.read().lower().split("\t")
        for word in cleaner(" ".join(fields)):
            # Idiom fix: membership test instead of ``vocab.get(w) == None``
            # (comparing to None with ``==`` is non-idiomatic and does a
            # needless lookup).
            if word not in vocab:
                # The current size is a fresh, unique id for the new word.
                vocab[word] = len(vocab)
def tokens_to_regex(self, toClean):
    """Translate every raw token definition in ``self.tokens`` into a cleaned
    regex entry of the form ``{"token": ..., "isExcept": ...}``.

    Each value goes through: complex-operator reduction (repeated until no
    ``}`` or ``]`` remains), simple-operator translation, variable
    identification against ``self.characters``, and a final cleaning pass.
    """
    for name in self.tokens.keys():
        raw = self.tokens[name]
        print("Processed TOKENS", raw)
        # Complex operators: {} [a{b{c}}] a|b[c] -- reduce until none remain.
        regex = utils.complex_operators_eval(raw)
        while "}" in regex or "]" in regex:
            regex = utils.complex_operators_eval(regex)
        # Simple operators: |
        regex = utils.simple_operators(regex)
        # Identify variables such as (letters)*|(digits)*.
        regex, is_except = utils.identifier(regex, self.characters)
        cleaned = utils.cleaner(regex, name, toClean)
        print("Final TOKEN ", cleaned)
        self.tokens[name] = {"token": cleaned, "isExcept": is_except}