Пример #1
0
def create_vocabulary(row, voc):
    """Add the unseen terms of one row to the vocabulary.

    The row's ``'intro'`` and ``'plot'`` fields are concatenated and passed
    through ``utils.cleaner``. Every resulting element that is not yet a key
    of *voc* is mapped to the current value of the module-level counter
    ``voc_global``, which is then incremented.

    Args:
        row: Record that can be indexed with ``'intro'`` and ``'plot'``.
        voc: Dict mapping term -> integer id; updated in place.

    Returns:
        None. A row whose text cannot be built or cleaned is skipped.
    """
    # NOTE(review): voc_global must already exist at module level before the
    # first call; it is not initialised here.
    global voc_global
    try:
        text = utils.cleaner(row['intro'] + row['plot'])
    except Exception:
        # Best-effort: skip rows whose fields are missing or cannot be
        # concatenated/cleaned, but let KeyboardInterrupt and SystemExit
        # propagate (a bare ``except:`` would swallow them too).
        return
    for elem in text:
        if elem not in voc:
            voc[elem] = voc_global
            voc_global += 1
Пример #2
0
def search_engine_1_apply(row, word_dict, query, vocabulary):
    """Tell whether a row's film matches every term of the query.

    The query is cleaned with ``utils.cleaner``; each cleaned term is
    translated to its id through *vocabulary* and looked up in *word_dict*.

    Args:
        row: Record that can be indexed with ``'film_id'``.
        word_dict: Mapping term id -> container of film ids.
        query: Raw query text.
        vocabulary: Mapping term -> term id.

    Returns:
        1 if every cleaned query term is indexed for ``row['film_id']``,
        otherwise 0. An empty cleaned query yields 1.
    """
    for term in utils.cleaner(query):
        # A term unknown to the vocabulary cannot appear in any document, so
        # it is a non-match rather than a KeyError.
        if term not in vocabulary:
            return 0
        term_id = vocabulary[term]
        if term_id not in word_dict:
            return 0
        if row['film_id'] not in word_dict[term_id]:
            return 0
    return 1
Пример #3
0
def cleaner_tsv_files_update_index(vocab, index, path_tsv):
    """Feed every ``article_*`` file of a directory to ``updateIndex``.

    Files are visited in sorted name order. Each one is read as UTF-8,
    lower-cased and split on tabs; the second and third tab-separated fields
    are joined with a space, cleaned with ``cleaner`` and handed to
    ``updateIndex`` together with the file name.

    Args:
        vocab: Passed through to ``updateIndex``.
        index: Passed through to ``updateIndex``.
        path_tsv: Directory containing the tsv files, with or without a
            trailing path separator.
    """
    for file in sorted(os.listdir(path_tsv)):
        if not file.startswith("article_"):
            continue
        # os.path.join works whether or not path_tsv ends with a separator.
        with open(os.path.join(path_tsv, file), "r", encoding="utf-8") as f:
            fields = f.read().lower().split("\t")
        # Only fields 1 and 2 are indexed.
        # NOTE(review): presumably these are the intro and plot columns -
        # confirm against the tsv layout.
        clean_text_words = cleaner(" ".join(fields[1:3]))
        updateIndex(vocab, index, clean_text_words, file)
Пример #4
0
def cleaner_tsv_files_update_index_2(vocab, index1, index2, path_tsv):
    """Feed every ``article_*`` file of a directory to ``updateIndex_2``.

    Files are visited in sorted name order. The whole content of each file
    is read as UTF-8, lower-cased, cleaned with ``cleaner`` and handed to
    ``updateIndex_2`` together with the file name.

    Args:
        vocab: Passed through to ``updateIndex_2``.
        index1: Passed through to ``updateIndex_2``.
        index2: Passed through to ``updateIndex_2``.
        path_tsv: Directory containing the tsv files, with or without a
            trailing path separator.
    """
    for file in sorted(os.listdir(path_tsv)):
        if not file.startswith("article_"):
            continue
        # os.path.join works whether or not path_tsv ends with a separator.
        with open(os.path.join(path_tsv, file), "r", encoding="utf-8") as f:
            text = f.read().lower()
        clean_text_words = cleaner(text)
        updateIndex_2(vocab, index1, index2, clean_text_words, file)
Пример #5
0
def main():
    """Run the interactive banking session.

    Loads the bank data, draws the screen and authenticates an account. When
    authentication yields a falsy value an error message is printed and the
    function ends. Otherwise the user is welcomed and menu operations are
    executed in a loop; after each one the user is asked whether to leave,
    and any answer other than ``'n'`` ends the session.
    """
    load_bank_data()
    cleaner()
    header()
    account_auth = auth_account()

    # Guard clause: nothing more to do without an authenticated account.
    if not account_auth:
        print('Conta inválida')
        return

    cleaner()
    header()
    welcome(account_auth)

    while True:
        chosen_option = get_menu_options_typed(account_auth)
        do_operation(chosen_option, account_auth)

        print()
        # Leaving is the default: only an explicit 'n' keeps the session open.
        if input('Deseja sair da conta S/n? ') != 'n':
            break

    print('Bye')
Пример #6
0
def create_inverted_dict(row, inverted_dict, vocabulary):
    """Record the term occurrences of one row in the inverted index.

    The row's ``'intro'`` and ``'plot'`` fields are concatenated and cleaned
    with ``utils.cleaner``. For each resulting term, its id is looked up in
    *vocabulary* and the counter stored under
    ``inverted_dict[term_id][film_id]`` is created or incremented.

    Args:
        row: Record that can be indexed with ``'film_id'``, ``'intro'`` and
            ``'plot'``.
        inverted_dict: Dict mapping term id -> {film id: occurrence count};
            updated in place.
        vocabulary: Mapping term -> term id.

    Returns:
        The number of cleaned terms in the row, or 0 when the text could not
        be built or cleaned.
    """
    film_id = row['film_id']
    try:
        tokens = utils.cleaner(row['intro'] + row['plot'])
    except Exception:
        return 0
    for token in tokens:
        # Fetch (or create) the per-term postings, then bump this film's count.
        postings = inverted_dict.setdefault(vocabulary[token], {})
        postings[film_id] = postings.get(film_id, 0) + 1
    return len(tokens)
Пример #7
0
def create_vocabulary(vocab, path_tsv):
    """Extend the vocabulary with the words of every ``article_*`` file.

    Files are visited in sorted name order. Each one is read as UTF-8 and
    lower-cased (so that e.g. "Tree" and "tree" are the same word), its
    tab-separated fields are joined with spaces, and the result is
    preprocessed with ``cleaner``. Every word not yet in *vocab* is added
    with the vocabulary's current size as its id.

    Args:
        vocab: Dict mapping word -> integer id; updated in place.
        path_tsv: Directory containing the tsv files, with or without a
            trailing path separator.
    """
    for file in sorted(os.listdir(path_tsv)):
        if not file.startswith("article_"):
            continue
        # os.path.join works whether or not path_tsv ends with a separator.
        with open(os.path.join(path_tsv, file), "r", encoding="utf-8") as f:
            text = f.read().lower().split("\t")
        for w in cleaner(" ".join(text)):
            if w not in vocab:
                # The size before insertion is used as the new id; ids stay
                # unique as long as entries are never removed from vocab.
                vocab[w] = len(vocab)
Пример #8
0
    def tokens_to_regex(self, toClean):
        """Replace every entry of ``self.tokens`` with its processed form.

        Each token value goes through ``utils.complex_operators_eval``
        (repeated while the result still contains ``}`` or ``]``),
        ``utils.simple_operators``, ``utils.identifier`` and finally
        ``utils.cleaner``. The entry is then overwritten with a dict holding
        the cleaned value under ``"token"`` and the flag returned by
        ``utils.identifier`` under ``"isExcept"``.

        Args:
            toClean: Forwarded unchanged to ``utils.cleaner``.
        """
        for name in self.tokens.keys():
            raw = self.tokens[name]
            print("Processed TOKENS", raw)

            # Complex operators: {} and [] groups, e.g. [a{b{c}}] a|b[c].
            # Keep evaluating until no closing brace/bracket is left.
            expanded = utils.complex_operators_eval(raw)
            while any(closer in expanded for closer in ("}", "]")):
                expanded = utils.complex_operators_eval(expanded)

            # Simple operator: |
            translated = utils.simple_operators(expanded)

            # Identify variables: (letters)*|(digits)*
            identified, is_except = utils.identifier(
                translated, self.characters)
            final_token = utils.cleaner(identified, name, toClean)

            print("Final TOKEN ", final_token)
            self.tokens[name] = {"token": final_token, "isExcept": is_except}