Example No. 1
    def _send_qubits(self):
        print("Sending {} qubits...".format(self.N))
        for i in range(0, self.N):

            # Generate a key bit
            k = random.randint(0, 1)
            self.raw_key.append(k)
            chosen_basis = random.randint(0, 1)
            self.basis_list.append(chosen_basis)

            # Create a qubit
            q = qubit(self.cqc)

            # Encode the key in the qubit
            if k == 1:
                q.X()
            # Encode in H basis if basis = 1
            if chosen_basis == 1:
                q.H()

            self.cqc.sendQubit(q, "Eve")
            qubit_received = communication.receive_message(
                self.cqc, self.receiver_pkey)
            print_progress_bar(i, self.N - 1)

        done_receiving = communication.receive_message(self.cqc,
                                                       self.receiver_pkey)
        assert done_receiving == 'DONE'
    def normalize_audio(self):
        print('Start normalizing audio files...')

        count_of_records = count_files_in_dataset_dir(self.dataset_dir, file_format=self.format)
        print('Number of files to process: ', count_of_records)

        time_start = datetime.datetime.now()
        record_idx = 0
        print_progress_bar(iteration=record_idx, total=count_of_records,
                           prefix='{}/{}'.format(record_idx, count_of_records),
                           suffix='complete')

        for root, dirs, files in sorted(walk(self.dataset_dir)):
            for audio_file in files:
                if audio_file.endswith(self.format):
                    path_to_file = join(root, audio_file)
                    sound = AudioSegment.from_file(path_to_file, self.format)
                    normalized_sound = self.__match_target_amplitude(sound, self.level)
                    self.__save_audio(normalized_sound, path_to_file, self.format, root_dir=self.dataset_dir)

                    record_idx += 1
                    print_progress_bar(iteration=record_idx, total=count_of_records,
                                       prefix='{}/{}'.format(record_idx, count_of_records),
                                       suffix='complete')
        print('All audio files successfully normalized and saved')

        time_end = datetime.datetime.now()
        print('Elapsed time: ', time_end-time_start)
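
Every example in this collection calls a print_progress_bar helper whose definition is not included. A minimal sketch of such a helper (an assumption based only on the iteration, total, prefix, suffix, decimals and length arguments visible in the calls on this page, not the original implementation) could look like this:

import sys


def print_progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█'):
    # Illustrative sketch only: render a single-line console progress bar.
    fraction = iteration / float(total) if total else 1.0
    percent = '{0:.{1}f}'.format(100 * fraction, decimals)
    filled = int(length * fraction)
    bar = fill * filled + '-' * (length - filled)
    sys.stdout.write('\r{} |{}| {}% {}'.format(prefix, bar, percent, suffix))
    sys.stdout.flush()
    if iteration >= total:
        sys.stdout.write('\n')  # move to a new line once the bar is complete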
Example No. 3
    def generate_sequence(self):
        print_progress_bar(0, self.n, prefix='Progress:', suffix='Complete', length=50)
        for n_iter in range(self.n):
            self.generate_next_x()
            # Take the last s generated bits, join them into a binary string and convert to an integer
            line = self.x_arr[-self.s:]
            tmp = ''.join(str(x) for x in line)
            new_y = int(tmp, 2)
            self.y_arr.append(new_y)
            print_progress_bar(n_iter + 1, self.n, prefix='Progress:', suffix='Complete', length=50)
Example No. 4
def load_voxceleb_features(dataset_dir, list_path, num_timesteps, debug_mode):
    debug_break_number = 1000

    print('Started loading features from files...')
    time_start = datetime.datetime.now()

    with open(list_path) as f:
        f_list = list(f)

    features = []
    labels = []
    names_list = []

    num_files = len(f_list)
    print('Files to process: ', num_files)
    file_idx = 0
    print_progress_bar(iteration=file_idx,
                       total=num_files,
                       prefix='{}/{}'.format(file_idx, num_files),
                       suffix='complete')

    for line in f_list:
        path, label = line.rstrip().split(' ')
        path = path[:-4] + '.npy'  # replace the .wav extension with .npy
        path = join(dataset_dir, path)
        label = int(label) - 1  # in VoxCeleb, classes start from 1; shift to 0

        timestep_samples = load_feature_from_file(path, num_timesteps)
        stretched_label = np.full(timestep_samples.shape[0], label)
        stretched_name = np.full(timestep_samples.shape[0], line)

        if timestep_samples.shape[0] != 0:
            features.extend(timestep_samples)
            labels.extend(stretched_label)
            names_list.extend(stretched_name)

        file_idx += 1
        print_progress_bar(iteration=file_idx,
                           total=num_files,
                           prefix='{}/{}'.format(file_idx, num_files),
                           suffix='complete')
        if debug_mode and file_idx == debug_break_number:
            print('\nDebug mode. Interrupted on ', file_idx)
            break

    print('Dataset successfully processed and saved')
    time_end = datetime.datetime.now()
    print('Elapsed time: ', time_end - time_start)

    features = np.array(features)
    labels = np.array(labels)
    return features, labels, names_list
Example No. 5
def task_apply_theme():
    from subprocess import call

    command = (
        'am start -n com.android.thememanager/com.android.thememanager.ApplyThemeForScreenshot -e "'
        + "theme_file_path" + '" "' + THEME_DEST_LOC + THEME_OUT_NAME + '" ' +
        ' -e "api_called_from" "ThemeEditor_' + THEME_VERSION + '"')

    progress = 0
    print_progress_bar(progress, 7, suffix="", length=50)

    result = call([ADB, "shell", command], stdout=devnull, stderr=devnull)
    if result != 0:
        print(
            '\033[91mno devices/emulators found, try to use task "Connect to ADB"\033[0m'
        )
        print_progress_bar(progress,
                           7,
                           suffix=f"Error!" + (" " * 20),
                           length=50)

    if result == 0:
        while progress < 7:
            print_progress_bar(progress,
                               7,
                               suffix=f"Applying!" + (" " * 20),
                               length=50)
            time.sleep(1)
            progress += 1
        print_progress_bar(progress,
                           7,
                           suffix=f"Complete!" + (" " * 20),
                           length=50)
    return result
Example No. 6
def audio_mixup_augmentation(paths_list, cleared_audio_dir, alpha, num_classes,
                             sample_rate, num_coeffs, num_timesteps,
                             num_to_mix):
    print('Started data augmentation using audio mixup...')
    time_start = datetime.datetime.now()
    mixup_features = []
    mixup_labels = []

    paths_list = list(dict.fromkeys(paths_list))  # remove all duplicates
    num_records = len(paths_list)
    np.random.shuffle(paths_list)
    audio, labels = load_cleared_audio(cleared_audio_dir, paths_list)
    labels = to_categorical(labels, num_classes)

    print('Augmentation:')
    num_same_class = 0
    for idx in range(num_records - num_to_mix + 1):
        for mix_idx in range(num_to_mix - 1):
            lam = np.random.beta(a=alpha, b=alpha)
            audio_size = len(audio[idx]) if len(audio[idx]) < len(
                audio[idx + mix_idx + 1]) else len(audio[idx + mix_idx + 1])
            mixed_audio = lam * audio[idx][:audio_size] + (
                1 - lam) * audio[idx + mix_idx + 1][:audio_size]
            mixed_lab = lam * labels[idx] + (1 - lam) * labels[idx + mix_idx +
                                                               1]

            extractor = FeatureExtractor(mixed_audio, sample_rate, num_coeffs)
            mixed_feat = extractor.extract_log_mel_filterbank_energies()
            timestep_samples = compose_samples_with_timesteps(
                mixed_feat, num_timesteps)
            mixup_features.extend(timestep_samples)
            stretched_label = np.tile(mixed_lab,
                                      (timestep_samples.shape[0], 1))
            mixup_labels.extend(stretched_label)

            if np.array_equal(labels[idx], labels[idx + mix_idx + 1]):
                num_same_class += 1

        print_progress_bar(idx,
                           num_records - num_to_mix,
                           prefix='{}/{}'.format(idx + 1,
                                                 num_records - num_to_mix + 1),
                           suffix='complete')

    time_end = datetime.datetime.now()
    print('Elapsed time: ', time_end - time_start)
    ratio = num_same_class / (num_to_mix * num_records)
    print('%.2f%% are mixed from the same class,'
          '\n%.2f%% are mixed from different classes' % (ratio * 100, (1 - ratio) * 100))
    return np.array(mixup_features), np.array(mixup_labels)
    return np.array(mixup_features), np.array(mixup_labels)
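
The mixup used above (and in the feature-level variant in Example No. 12) is a convex combination of two samples and of their labels, with the mixing coefficient drawn from a Beta(alpha, alpha) distribution. A minimal standalone illustration, assuming only NumPy and two hypothetical one-hot labelled samples, is:

import numpy as np

alpha = 0.2
rng = np.random.default_rng(0)

# Two hypothetical samples with one-hot labels (3 classes)
x_a, y_a = np.array([0.1, 0.5, 0.9]), np.array([1.0, 0.0, 0.0])
x_b, y_b = np.array([0.7, 0.2, 0.4]), np.array([0.0, 1.0, 0.0])

lam = rng.beta(alpha, alpha)           # mixing coefficient in (0, 1)
x_mix = lam * x_a + (1 - lam) * x_b    # mixed features
y_mix = lam * y_a + (1 - lam) * y_b    # mixed (soft) label
print(lam, x_mix, y_mix)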
Example No. 7
    def __init__(self, c, k, w, p_arr, x_start_arr, m_arr, j_arr, q, j):
        self.c = c
        self.k = k
        self.w = w
        self.p_arr = p_arr
        self.x_start_arr = x_start_arr
        self.m_arr = [int(m) for m in m_arr]
        self.j_arr = [[int(item) - 1 for item in ar] for ar in j_arr]
        self.q = int(q)
        self.j = [[int(j) for j in str(item)] for item in j]

        print_progress_bar(0,
                           self.k,
                           prefix='Progress step 1:',
                           suffix='Complete',
                           length=50)
        self.lfsr_generated_list = []
        for i in range(self.k):
            self.lfsr_generated_list.append(
                self.generate_lfsr_list(
                    int(self.c * self.w / self.p_arr[i] + 1), self.p_arr[i],
                    self.m_arr[i], self.j_arr[i], self.x_start_arr[i]))
            print_progress_bar(i + 1,
                               self.k,
                               prefix='Progress step 1:',
                               suffix='Complete',
                               length=50)

        print_progress_bar(0,
                           self.c,
                           prefix='Progress step 2:',
                           suffix='Complete',
                           length=50)
        self.y_arr = []
        for c_iter in range(self.c):
            result = []
            for w_iter in range(self.w):
                new_x = 0
                bits = [
                    item[self.w * c_iter + w_iter]
                    for item in self.lfsr_generated_list
                ]
                for q_iter in range(self.q):
                    need_bits = [
                        1 if self.j[q_iter][idx] == 0 else value
                        for idx, value in enumerate(bits)
                    ]
                    qq = need_bits[0]
                    for bit in need_bits[1:]:
                        qq = qq & bit
                    new_x = new_x ^ qq
                result.append(new_x)
            self.y_arr.append(int(''.join([str(it) for it in result]), 2))
            print_progress_bar(c_iter + 1,
                               self.c,
                               prefix='Progress step 2:',
                               suffix='Complete',
                               length=50)
Example No. 8
def concatenateImages(img_files, grid_size=None, tile_size=None):
    '''
    Concatenate images into one single (large) image
    @params:
        img_files - list of image file paths
        grid_size - tuple holding number of tiles by width and height
        tile_size - tuple specifying (width, height) of a tile # all tiles are the same size
    '''

    if tile_size is None:
        temp_img = Image.open(
            img_files[0])  # open the first tile image just to get its size
        # all tiles are assumed to be the same size as the first image tile
        tile_width = temp_img.size[0]
        tile_height = temp_img.size[1]
    else:
        tile_width, tile_height = tile_size

    if grid_size is None:
        grid_size_eq = math.floor(
            math.sqrt(len(img_files))
        )  # divide the tiles equally by width and height using the square root of the total tile count
        grid_size = (grid_size_eq, grid_size_eq)

    total_width = grid_size[0] * tile_width  # in pixels
    total_height = grid_size[1] * tile_height  # in pixels

    new_im = Image.new('RGB', (total_width, total_height))
    #14720px WIDTH WORKING!
    #18560px WIDTH WORKING!
    #26560px WIDTH WORKING!

    progressCounter = 0
    totalIterations = grid_size[0] * grid_size[1]

    for y in range(0, total_height, tile_height):
        for x in range(0, total_width, tile_width):
            new_im.paste(Image.open(img_files[progressCounter]), (x, y))
            progressCounter += 1
            print_progress_bar(progressCounter, totalIterations, 'Progress:',
                               'completed.')

    output_path = 'test_' + str(
        (tile_width,
         tile_height)) + 'px_' + str(grid_size) + 'gridSize_' + str(
             datetime.datetime.now()) + '.jpg'
    new_im.save(output_path)
    print(f'Finished. Saved to {output_path}')
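
A hypothetical call for this helper could pass a sorted list of tile paths and let the grid and tile size be inferred from the first image (the directory name below is made up for illustration):

from glob import glob

tiles = sorted(glob('tiles/*.jpg'))  # hypothetical tile directory
concatenateImages(tiles)  # grid_size and tile_size are inferred when omitted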
Example No. 9
    def sort(self, documents_cleaner):
        """
        Sort the set of files associated with each word of the index by score.
        Each set becomes a list ordered according to the scores.
        @params:
            documents_cleaner - Required  : the documents to sort (Dict)
        """
        list_files_content = []
        compteur = 1
        taille = len(documents_cleaner.items())
        # Collect the contents of every file
        for file, content in documents_cleaner.items():
            list_files_content.append(content)
            # Progress bar
            progress_bar.print_progress_bar(
                compteur,
                taille,
                prefix='Récupération des fichiers : ' + str(compteur) + '/' +
                str(taille),
                suffix='')
            compteur = compteur + 1
        compteur = 1
        taille = len(self.index.items())
        # For each word of each file, compute the associated score
        for word, files in self.index.items():
            dict_word = dict()
            for file in files:
                score_word_file = tf_idf.TF_frequence_brute(
                    word, documents_cleaner[file])
                dict_word[file] = score_word_file
            # Sort by score
            list_word = sorted(dict_word.items(),
                               key=lambda t: t[1],
                               reverse=True)
            # The index entry becomes word: [('file_name', score), ...]
            # so, for each word, the files are ordered by decreasing score
            self.index[word] = list_word
            # Progress bar
            progress_bar.print_progress_bar(
                compteur,
                taille,
                prefix='Tri des fichiers (par score de mot) : ' +
                str(compteur) + '/' + str(taille),
                suffix='')
            compteur = compteur + 1
Example No. 10
def collection_of_real_words(collection_of_possible_words):
    """
    enumerates through each possible word and searches if it exists in a list of words found in "/usr/share/dict/words"
    :param collection_of_possible_words: an array of string elements containing words of possible english words
    :return: an array of the matches found in "/usr/share/dict/words"
    """
    str_words = collect_words_in_file("/usr/share/dict/words")
    arr_words = str_words.split('\n')

    len_possible_words = len(collection_of_possible_words)
    words_found = []
    progress_bar.print_progress_bar(0, len_possible_words, 'Progress:', 'Complete', 5)

    # str_word_term is assumed to be a module-level variable holding the original search term
    for i, possible_word in enumerate(collection_of_possible_words):
        if find_word_in_collection(possible_word, arr_words) and possible_word != str_word_term:
            words_found.append(possible_word)
        progress_bar.print_progress_bar(i + 1, len_possible_words, 'Progress:',
                                        'of {} iterations'.format(len_possible_words), 5)

    return words_found
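
A hypothetical usage sketch: pass a list of candidate strings (for instance, permutations of some letters) and get back only those present in the system dictionary. The candidate list below is made up for illustration:

candidates = ['silent', 'enlist', 'tinsel', 'lsneit']
real_words = collection_of_real_words(candidates)
print(real_words)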
Example No. 11
def convert_url(urls):
    """
    Fonction permettant d'avoir les documents sous forme de dictionnaire {'file_name':'file_content'}
	Ici le file_content est normalisé et nettoyé
	On affiche en plus la progression des fichiers nettoyés
    @params:
        urls  - Required : Les urls des fichiers
    """
    res = dict()
    compteur = 1
    taille = len(urls)
    for url in urls:
        # Associate each URL with its cleaned content
        res[url] = cleaner_link(url)
        progress_bar.print_progress_bar(compteur,
                                        taille,
                                        prefix='Nettoyage des fichier : ' +
                                        str(compteur) + '/' + str(taille),
                                        suffix='')
        compteur = compteur + 1
    return res
Example No. 12
def feature_mixup_augmentation(features, labels, alpha, num_to_mix):
    print('Started data augmentation using feature mixup...')
    time_start = datetime.datetime.now()

    mixup_features = []
    mixup_labels = []
    num_records = len(labels)

    indices = np.arange(num_records)
    np.random.shuffle(indices)
    features = features[indices]
    labels = labels[indices]

    num_same_class = 0
    for idx in range(num_records - num_to_mix + 1):
        for mix_idx in range(num_to_mix - 1):
            lam = np.random.beta(a=alpha, b=alpha)
            mixed_feat = lam * features[idx] + (
                1 - lam) * features[idx + mix_idx + 1]
            mixed_lab = lam * labels[idx] + (1 - lam) * labels[idx + mix_idx +
                                                               1]
            mixup_features.append(mixed_feat)
            mixup_labels.append(mixed_lab)

            if np.array_equal(labels[idx], labels[idx + mix_idx + 1]):
                num_same_class += 1

        print_progress_bar(idx,
                           num_records - num_to_mix,
                           prefix='{}/{}'.format(idx + 1,
                                                 num_records - num_to_mix + 1),
                           suffix='complete')

    ratio = num_same_class / (num_to_mix * num_records)
    print('%.2f%% are mixed from the same class,'
          '\n%.2f%% are mixed from different classes' % (ratio * 100, (1 - ratio) * 100))

    time_end = datetime.datetime.now()
    print('Elapsed time: ', time_end - time_start)
    return np.array(mixup_features), np.array(mixup_labels)
Example No. 13
def push(src_file, dst_path):
    result = subprocess.call([ADB, "devices"], stderr=ignore, stdout=ignore)
    if result != 0:
        print(
            '\033[91mno devices/emulators found, try to use task "Connect to ADB"\033[0m'
        )
        return result

    progress = 0
    print_progress_bar(progress, 1, suffix="", length=50)
    result = subprocess.call([ADB, "push", src_file, dst_path],
                             stderr=ignore,
                             stdout=ignore)

    if result != 0:
        print_progress_bar(progress,
                           1,
                           suffix=f"Error!" + (" " * 20),
                           length=50)
        print(
            f"failed to push {src_file} to directory {dst_path} with result code {result}\n"
        )
        return result
    if result == 0:
        progress += 1
        print_progress_bar(progress,
                           1,
                           suffix=f"Complete!" + (" " * 20),
                           length=50)

    return result
def _extract_features(corpus_dir, file_extension, excluded_paths, features,
                      output_file):
    file_names = _get_filenames(corpus_dir, file_extension, excluded_paths)
    feature_tuples = [(name, decorated_features[name]) for name in features]
    text_to_features = {}  #Associates file names to their respective features
    print('Extracting features from .' + file_extension + ' files in ' +
          YELLOW + corpus_dir + RESET)

    #Feature extraction
    file_no = 1
    for file_name in file_names:
        text_to_features[file_name] = {}

        file_text = file_parsers[file_extension](file_name)

        for feature_name, func in feature_tuples:
            score = func(file_text, file_name)
            text_to_features[file_name][feature_name] = score
            if output_file is None:
                print(file_name + ', ' + str(feature_name) + ', ' + GREEN +
                      str(score) + RESET)

        if output_file is not None:
            print_progress_bar(file_no,
                               len(file_names),
                               prefix='Progress',
                               suffix='(%d of %d files)' %
                               (file_no, len(file_names)))
            file_no += 1

    clear_cache(tokenize_types, debug_output)

    if output_file is not None:
        print(
            'Feature mining complete. Attempting to write feature results to "'
            + YELLOW + output_file + RESET + '"...')
        with open(output_file, 'wb') as pickle_file:
            pickle_file.write(pickle.dumps(text_to_features))
        print(GREEN + 'Success!' + RESET)
def dicoSimilaire(dicoDocuments):
    print("Creation du dico des pages similaires")
    dicoSim = dict()
    compteur = 1
    taille = len(dicoDocuments.items())
    print("Traitement pages similaires\n")
    for lien1, texte1 in dicoDocuments.items():
        for lien2, texte2 in dicoDocuments.items():
            if (lien1 != lien2):
                d = textdistance.hamming.similarity(texte1, texte2)
                s = dicoSim.setdefault(lien1, set())
                # if their similarity is greater than 300, add the page as a value in the set
                if d > 300:
                    s.add(lien2)
        progress_bar.print_progress_bar(
            compteur,
            taille,
            prefix='Vérifie si les fichiers sont dans le dico : ' +
            str(compteur) + '/' + str(taille),
            suffix='')
        compteur = compteur + 1
    return (dicoSim)
Example No. 16
    def build(self, documents):
        """
        Build the inverted index.
        This function adds each word and its file to the inverted index.
        @params:
            documents - Required  : the documents from which to build the inverted index (Dict)
        """
        compteur = 1
        taille = len(documents)
        for file_name, file_content in documents.items():
            self.add(file_name, file_content)
            progress_bar.print_progress_bar(
                compteur,
                taille,
                prefix='Indexation des fichiers : ' + str(compteur) + '/' +
                str(taille),
                suffix='')
            compteur = compteur + 1
        print(
            "\nTri des fichiers (trie décroissant des scores des fichiers en fonction du mot indexe) en cours..."
        )
        self.sort(documents)
        print("\nTri des fichiers terminé")
def supressionPage(dico):
    fichiers = os.listdir('./pages_web/')
    taille = len(fichiers)
    compteur = 1
    for fichier in fichiers:
        try:
            #print(dico[fichier]); #KeyError
            # if the file is a key in the dict, remove from the dict the pages in its set
            # (a set of file names, as strings)
            setfichiers = dico[fichier]
            #print("clé trouvé")
            for f in setfichiers:
                dico.pop(f)
                # remove the file-name key from the dict
        except KeyError:
            #print("pas de clé trouvé donc suppression de la page")
            os.remove('./pages_web/' + fichier)
        progress_bar.print_progress_bar(
            compteur,
            taille,
            prefix='Vérifie si les fichiers sont dans le dico : ' +
            str(compteur) + '/' + str(taille),
            suffix='')
        compteur = compteur + 1
Example No. 18
def load_cleared_audio(dataset_dir, files_list):
    print('Loading cleared audio files...')
    audio_all = []
    labels = []
    file_idx = 0
    num_files = len(files_list)
    print_progress_bar(iteration=file_idx,
                       total=num_files,
                       prefix='{}/{}'.format(file_idx, num_files),
                       suffix='complete')
    for line in files_list:
        path, label = line.rstrip().split(' ')
        path = path[:-4] + '.npy'  # eliminate .wav and add .npy
        path_to_file = join(dataset_dir, path)
        audio = np.load(path_to_file)
        audio_all.append(audio)
        labels.append(int(label) -
                      1)  # in VoxCeleb, classes start from 1; shift to 0
        file_idx += 1
        print_progress_bar(iteration=file_idx,
                           total=num_files,
                           prefix='{}/{}'.format(file_idx, num_files),
                           suffix='complete')
    return np.array(audio_all), np.array(labels)
Example No. 19
def push(directory, cleanup=False):
    items = glob(directory + "/*")
    changed = [
        os.path.relpath(path, directory) for path in items
        if storage.is_path_changed(path)
    ]

    if len(changed) < 1:
        print_progress_bar(1, 1, suffix='Complete!', length=50)
        return 0

    dst_root = get_push_pack_directory()
    if dst_root is None:
        return -1

    result = subprocess.call([make_config.get_adb(), "devices"],
                             stderr=ignore,
                             stdout=ignore)
    if result != 0:
        print(
            "\033[91mno devices/emulators found, try to use task \"Connect to ADB\"\033[0m"
        )
        return result

    dst_root = dst_root.replace("\\", "/")
    if not dst_root.startswith("/"):
        dst_root = "/" + dst_root

    src_root = directory.replace("\\", "/")

    progress = 0
    for filename in changed:
        src = src_root + "/" + filename
        dst = dst_root + "/" + filename
        print_progress_bar(progress,
                           len(changed),
                           suffix=f'Pushing {filename}' + (" " * 20),
                           length=50)
        subprocess.call([make_config.get_adb(), "shell", "rm", "-r", dst],
                        stderr=ignore,
                        stdout=ignore)
        result = subprocess.call([make_config.get_adb(), "push", src, dst],
                                 stderr=ignore,
                                 stdout=ignore)
        progress += 1

        if result != 0:
            print(f"failed to push to directory {dst_root} with code {result}")
            return result

    print_progress_bar(progress,
                       len(changed),
                       suffix=f'Complete!' + (" " * 20),
                       length=50)
    storage.save()
    return result
Example No. 20
def install(arch="arm", reinstall=False):
    if not reinstall and check_installed(arch):
        print("toolchain for " + arch +
              " is already installed, installation skipped")
        return True
    else:
        ndk_path = get_ndk_path()
        if ndk_path is None:
            from urllib import request
            print("failed to get ndk path")
            ans = input("download ndk? (Y/N) ")
            if ans.lower() == "y":
                archive_path = make_config.get_path("toolchain\\temp\\ndk.zip")
                makedirs(dirname(archive_path), exist_ok=True)

                if not isfile(archive_path):
                    url = "https://dl.google.com/android/repository/android-ndk-r16b-windows-x86_64.zip"
                    with request.urlopen(url) as response:
                        with open(archive_path, 'wb') as f:
                            info = response.info()
                            length = int(info["Content-Length"])

                            downloaded = 0
                            while True:
                                buffer = response.read(8192)
                                if not buffer:
                                    break

                                downloaded += len(buffer)
                                f.write(buffer)

                                print_progress_bar(
                                    downloaded,
                                    length,
                                    suffix='Downloading...'
                                    if downloaded < length else "Complete!",
                                    length=50)

                print("extracting ndk...")
                extract_path = make_config.get_path("toolchain\\temp")
                with ZipFile(archive_path, 'r') as archive:
                    archive.extractall(extract_path)

                ndk_path = search_ndk_path(extract_path, contains_ndk=True)
            else:
                print("aborting native compilation")
                return False

        print("installing...")
        result = subprocess.call([
            "python",
            join(ndk_path, "build", "tools", "make_standalone_toolchain.py"),
            "--arch",
            str(arch), "--install-dir",
            make_config.get_path("toolchain\\ndk\\" + str(arch)), "--force"
        ])

        if result == 0:
            open(
                make_config.get_path("toolchain\\ndk\\.installed-" +
                                     str(arch)), 'tw').close()
            print("removing temp files...")
            clear_directory(make_config.get_path("toolchain\\temp"))
            print("done!")
            return True
        else:
            print("installation failed with result code:", result)
            return False
Example No. 21
        if current_file_name.endswith('.' + 'tess')
    }))

counter = 1
diff1 = 0
diff2 = 0
diff3 = 0
for file_name in file_names:
    file_text = parse_tess(file_name)
    diff1 += 1 if before_params.tokenize(file_text) != after_params.tokenize(
        file_text) else 0
    diff2 += 1 if before_params2.tokenize(file_text) != after_params2.tokenize(
        file_text) else 0
    diff3 += 1 if before_params3.tokenize(file_text) != after_params3.tokenize(
        file_text) else 0
    print_progress_bar(counter, len(file_names))
    counter += 1
print('Differences between pickle loads: ' + str(diff1))
print('Differences between default PunktSentenceTokenizers: ' + str(diff2))
print('Differences between trained PunktSentenceTokenizers: ' + str(diff3))
'''
Changing class variables in PunktLanguageVars seems to have no effect on any of the tokenizers, before or after
'''
'''
Observe the following three snippets of code

(1)
>>> from nltk.tokenize.punkt import *
>>> s = 'test test test test test. test test test; test test.'
>>> PunktLanguageVars.sent_end_chars = ('.', ';')
>>> p = PunktSentenceTokenizer()
def detect_images(img_paths,
                  save_detected_images=False,
                  detection_threshold=0.5):

    # Define the video stream
    #cap = cv2.VideoCapture(0)  # Change only if you have more than one webcams
    print(f'TensorFlow version {tf.__version__}')

    # What model to download.
    # Models can be found here: https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md
    # {model name for downloading} {model name} {speed in ms} {detection in COCO measurement units}
    #MODEL_NAME = 'ssd_inception_v2_coco_2017_11_17' # ssd_inception_v2_coco 42ms 24COCO mAP
    #MODEL_NAME = 'ssd_resnet50_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03' # slower than ssd_inception_v2_coco_2017_11_17 model, same detection #ssd_resnet_50_fpn_coco ☆76ms 	35 COCO mAP
    MODEL_NAME = 'ssdlite_mobilenet_v2_coco_2018_05_09'  # fastest # same detection as ssd_inception_v2_coco_2017_11_17 #ssdlite_mobilenet_v2_coco 27ms	22 COCO mAP[^1]
    #MODEL_NAME = 'faster_rcnn_nas_coco_2018_01_28' # faster_rcnn_nas 1833ms 43 COCO mAP # DOES NOT WORK, it gets killed for some unknown reason

    MODEL_FILE = MODEL_NAME + '.tar.gz'
    DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'

    # Path to frozen detection graph. This is the actual model that is used for the object detection.
    PATH_TO_CKPT = MODEL_NAME + '/frozen_inference_graph.pb'

    # List of the strings that is used to add correct label for each box.
    path_to_research_folder = "/home/nikola/Git/models/research/object_detection/data/"
    PATH_TO_LABELS = os.path.join(
        'data', path_to_research_folder + 'mscoco_label_map.pbtxt')

    # Number of classes to detect
    NUM_CLASSES = 90

    # Download Model
    if not os.path.exists(MODEL_FILE):
        print(f"Downloading {MODEL_NAME} model...")
        opener = urllib.request.URLopener()
        opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
        tar_file = tarfile.open(MODEL_FILE)
        for file in tar_file.getmembers():
            file_name = os.path.basename(file.name)
            if 'frozen_inference_graph.pb' in file_name:
                tar_file.extract(file, os.getcwd())
    else:
        print(f"Model {MODEL_NAME} already downloaded")

    # Load a (frozen) Tensorflow model into memory.
    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.compat.v1.GraphDef()
        with tf.io.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')

    # Loading label map
    # Label maps map indices to category names, so that when our convolution network predicts `5`, we know that this corresponds to `airplane`.  Here we use internal utility functions, but anything that returns a dictionary mapping integers to appropriate string labels would be fine
    label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
    categories = label_map_util.convert_label_map_to_categories(
        label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
    category_index = label_map_util.create_category_index(categories)

    # Helper code
    def load_image_into_numpy_array(image):
        (im_width, im_height) = image.size
        return np.array(image.getdata()).reshape(
            (im_height, im_width, 3)).astype(np.uint8)

    def load_images(img_paths):
        ''' Load images via generator for less memory usage '''

        for img_path in img_paths:
            if not os.path.exists(img_path):
                print(
                    f"File could not be found. Check path and file extension. Entered path is {img_path}"
                )
                exit(0)

            if not os.path.isfile(img_path):
                print(
                    f"File is not a valid file. Check path and file extension. Entered path is {img_path}"
                )
                exit(0)

            #width, height =  img.size[0], img.size[1]
            #print('Frame size: width, height:', width, height)
            yield Image.open(img_path)

    # Detection
    with detection_graph.as_default():
        with tf.compat.v1.Session(graph=detection_graph) as sess:
            for counter, img in enumerate(load_images(img_paths), 1):

                if img is None:
                    print("Image is None")
                    exit(0)

                image_np = load_image_into_numpy_array(img)
                #image_np = load_image_into_numpy_array(image_np)
                #cv2.imshow('Loaded image', image_np)
                #cv2.waitKey(0)

                # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
                image_np_expanded = np.expand_dims(image_np, axis=0)

                # Extract image tensor
                image_tensor = detection_graph.get_tensor_by_name(
                    'image_tensor:0')
                # Extract detection boxes
                boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
                # Extract detection scores
                scores = detection_graph.get_tensor_by_name(
                    'detection_scores:0')
                # Extract detection classes
                classes = detection_graph.get_tensor_by_name(
                    'detection_classes:0')
                # Extract number of detections
                num_detections = detection_graph.get_tensor_by_name(
                    'num_detections:0')
                # Actual detection.
                (boxes, scores, classes, num_detections) = sess.run(
                    [boxes, scores, classes, num_detections],
                    feed_dict={image_tensor: image_np_expanded})
                # Visualization of the results of a detection.
                vis_util.visualize_boxes_and_labels_on_image_array(
                    image_np,
                    np.squeeze(boxes),
                    np.squeeze(classes).astype(np.int32),
                    np.squeeze(scores),
                    category_index,
                    use_normalized_coordinates=True,
                    line_thickness=4,
                    min_score_thresh=.5)

                # Print detected classes (above threshold level) # TODO: Count the same classes
                class_names = [
                    category_index[int(i)]['name'] for i in classes[0]
                ]
                above_threshold_scores = [
                    x for x in scores[0] if x > detection_threshold
                ]
                print(
                    f"Detected classes: {list(zip(class_names, above_threshold_scores))}"
                )

                img_filename_with_ext = img.filename.split('/')[-1]
                filename, file_ext = img_filename_with_ext.split(
                    '.')[0], img.format

                # Print current progress
                print_progress_bar(
                    counter,
                    len(img_paths),
                    prefix=f'Detecting image {img_filename_with_ext}')

                # Display output
                #cv2.imshow(f"{img_filename_with_ext} (press 'q' to exit)", cv2.resize(image_np, (800, 600)))

                # Save output
                if save_detected_images:
                    img_save_path = str(filename + '_detected_output(' +
                                        str(counter) + ').' + file_ext)
                    print(f'Saving detected output image to {img_save_path}')
                    ret = cv2.imwrite(img_save_path, image_np)

                    if not ret:
                        print(f'Warning. imwrite returned: {ret}')
def grid_search(is_threaded, k, size, alg_import):
    """
  Performs grid search over all hyperparameters for a particular problem. The
  best values for each hyperparameter will be printed when finished, so they can be set as defaults.

  Parameters
  ----------
  is_threaded : Boolean
    Should the grid search be distributed.
  k : Integer
    Condition number to use.
  size : Int
    Size of square A matrix.
  alg_import : Module
    Imported algorithm module to be used.

  Raises
  ------
  exc
    Exception that may occur in a sub process.

  Returns
  -------
  None.

  """

    options = alg_import.get_params_gs()

    # The product object that is returned is a lazy iterator, not a list.
    total = len(list(alg_import.get_params_gs()))
    iteration = 0

    best_values = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    best_fitness = 90000000

    if is_threaded:
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        executor = multiprocessing.Pool(8)
        signal.signal(signal.SIGINT, original_sigint_handler)
        submitted = 1
        futures = dict()

    try:
        if is_threaded:
            for x in options:
                # for GA
                # params = {'size': size, 'k':k, 'mu':x[0], 'sigma':x[1], 'alpha':x[2], 'indpb':x[3], 'tournsize':x[4], 'cxpb':x[5], 'mutpb':x[6]}
                # for GSA
                params = {
                    'size': size,
                    'k': k,
                    'gc': x[0],
                    'gd': x[1],
                    'number_generations': 10,
                    'problems': 0,
                    'cl': 10
                }
                futures[executor.apply_async(
                    _thread_caller, (params, alg_import.Algorithm))] = x
                print(f'\rSubmitted {submitted} / {total} jobs', end='\r')
                submitted += 1
        else:
            for params in options:
                alg = alg_import.Algorithm(problem=1,
                                           mu=params[0],
                                           sigma=params[1],
                                           alpha=params[2],
                                           indpb=params[3],
                                           tournsize=params[4],
                                           cxpb=params[5],
                                           mutpb=params[6],
                                           debug=-1,
                                           size=size,
                                           k=k,
                                           num_particles=50)
                _, fitness, _ = alg.run()

                if fitness <= best_fitness:
                    best_fitness = fitness
                    best_values = params

                print_progress_bar(iteration,
                                   total,
                                   suffix=("Complete--Best fitness: {0:.3f}"
                                           ).format(best_fitness))
                iteration += 1

        if is_threaded:
            for future in futures:
                parameters = futures[future]
                fitness = future.get()
                if fitness <= best_fitness:
                    best_fitness = fitness
                    best_values = parameters

                print_progress_bar(iteration,
                                   total,
                                   suffix=("Complete--Best fitness: {0:.3f}"
                                           ).format(best_fitness))
                iteration += 1
    except KeyboardInterrupt:
        if is_threaded:
            executor.terminate()
    else:
        if is_threaded:
            executor.close()

    if is_threaded:
        executor.join()

    # print("Best values from grid search evaluation is:\n\tMu:%.3f\n\tSigma:%.3f\n\tAlpha:%.3f\n\tIndpb:%.3f\n\tTournsize:%i\n\tCxpb:%.3f\n\tMutpb:%.3f"% best_values)
    print("Best parameters had fitness: %.3f" % (best_fitness))
    for file in files:
        if file.endswith(".epub"):
            time_last = datetime.datetime.now()
            scanned += 1
            filename = file[:-5]
            #print(filename)
            logger.info(filename)
            sourcepath = os.path.join(root, filename)
            if os.path.exists(sourcepath):
                shutil.rmtree(sourcepath)
                logger.info("\tRemoved source files")
            destpath = os.path.join(root, "LaTeX")
            if os.path.exists(destpath):
                shutil.rmtree(destpath)
                logger.info("\tRemoved destination files")
            elapsed = datetime.datetime.now() - time_last
            logger.info("Time: %02d:%02d.%03d" %
                        (elapsed.seconds // 60, elapsed.seconds % 60,
                         elapsed.microseconds // 1000))
            #print(filename + ": %02d:%02d.%03d" % (elapsed.seconds // 60, elapsed.seconds % 60, elapsed.microseconds // 1000))
            progress_bar.print_progress_bar(scanned, total)

elapsed = datetime.datetime.now() - time_first
logger.info("Total Time: %02d:%02d.%03d" %
            (elapsed.seconds // 60, elapsed.seconds % 60,
             elapsed.microseconds // 1000))
print("Total Time: %02d:%02d.%03d" % (elapsed.seconds // 60, elapsed.seconds %
                                      60, elapsed.microseconds // 1000))
print("%d Books Scanned" % scanned)
logger.info("%d Books Scanned" % scanned)
Example No. 25
def setup_alg(options, alg_import):
    """
  Perform setup for a given algorithm.

  Parameters
  ----------
  options : NameSpace
    All user given or default values for setup.
  alg_import : Algorithm import
    The import module of an algorithm file that will be used.

  Returns
  -------
  None.

  """
    if options.perform_grid_search:
        # Perform Grid search to find best hyperparameters to set as default
        grid_search(options.is_threaded, options.k, options.size, alg_import)
    else:
        if options.problems == 0:
            problem_runs = [0]
        elif options.problems == 1:
            problem_runs = [1]
        elif options.problems == 2:
            problem_runs = [0, 1]
        else:
            print("Problems given is not of [0, 1, 2]")
            exit(1)

        total_list_objective_function_counts = []
        for problem in problem_runs:
            print(f"\tRunning problem: {problem + 1}")
            print(f"\tThreading is: {'Enabled' if options.is_threaded else 'Disabled'}")

            if options.use_pred_inputs:
                run_num = 1
                for major_iterations in [500]:
                    options.number_generations = major_iterations
                    log_dict = dict()
                    dis_dict = dict()
                    if options.is_threaded:
                        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
                        executor = multiprocessing.Pool(8)
                        signal.signal(signal.SIGINT, original_sigint_handler)
                        submitted = 1
                        futures = dict()

                    try:
                        # for k in [3, 10, 30, 100, 300, 1000]:
                        #  for n in [2, 5, 10, 20, 50, 100]:
                        k_values = [5, 10, 50]
                        n_values = [5, 10, 50]
                        b_values = [5, 10, 50]
                        m_values = [5, 10, 50]
                        M_values = [5, 10, 50]

                        if problem == 0:
                            total = len(k_values) * len(n_values)
                        else:
                            total = len(b_values) * len(m_values) * len(M_values) * len(n_values)

                        for n in n_values:
                            if problem == 0:
                                for k in k_values:
                                    options.k = k
                                    options.size = n
                                    key = f"{n},{k}"
                                    key_header = ["n", "k"]

                                    if options.is_threaded:
                                        copy_options = copy(options)
                                        futures[executor.apply_async(run_alg, (
                                            problem, vars(copy_options), alg_import.Algorithm))] = key
                                        print(f'\rSubmitted {submitted} / {total} jobs', end='\r')
                                        submitted += 1
                                    else:

                                        loss_values, distance_from_sol, of_counts = run_alg(problem, vars(options),
                                                                                            alg_import.Algorithm)
                                        total_list_objective_function_counts.extend(of_counts)
                                        dis_dict[key] = distance_from_sol

                                        log_dict[key] = loss_values
                                        print_progress_bar(run_num, total)
                                        run_num += 1
                            else:
                                for m in m_values:
                                    for M in M_values:
                                        for b in b_values:
                                            options.ncm = m
                                            options.ncM = M
                                            options.ncb = b
                                            options.size = n
                                            key = f"{n},{m},{M},{b}"
                                            key_header = ["n", "m", "M", "b"]

                                            if options.is_threaded:
                                                copy_options = copy(options)
                                                futures[executor.apply_async(run_alg, (
                                                    problem, vars(copy_options), alg_import.Algorithm))] = key
                                                print(f'\rSubmitted {submitted} / {total} jobs', end='\r')
                                                submitted += 1
                                            else:
                                                loss_values, distance_from_sol, of_counts = run_alg(problem,
                                                                                                    vars(options),
                                                                                                    alg_import.Algorithm)
                                                total_list_objective_function_counts.extend(of_counts)
                                                dis_dict[key] = distance_from_sol
                                                log_dict[key] = loss_values
                                                print_progress_bar(run_num, total)
                                                run_num += 1

                        if options.is_threaded:
                            for future in futures:
                                key = futures[future]
                                try:
                                    loss_values, distance_from_sol, of_counts = future.get()
                                    total_list_objective_function_counts.extend(of_counts)
                                    dis_dict[key] = distance_from_sol
                                    log_dict[key] = loss_values
                                    print_progress_bar(run_num, total)
                                    run_num += 1
                                except Exception as exc:
                                    print('%r generated an exception: %s' % (key, exc))
                                    raise exc
                    except KeyboardInterrupt:
                        if options.is_threaded:
                            executor.terminate()
                    else:
                        if options.is_threaded:
                            executor.close()

                    if options.is_threaded:
                        executor.join()

                    if options.is_plot_exported:
                        plot_multi_data(options.num_particles, log_dict, alg_import)

                    if options.is_csv_exported:
                        save_csv_multi(options.num_particles, log_dict, alg_import, str(options.seed), problem,
                                       key_header)
            else:
                loss_values, distance_from_sol, of_counts = run_alg(problem, vars(options), alg_import.Algorithm)
                total_list_objective_function_counts.extend(of_counts)
                if options.is_csv_exported:
                    if problem == 0:
                        key = f"k={options.k}, n={options.size}"
                        dis_dict[key] = distance_from_sol
                        log_dict[key] = loss_values
                    else:
                        key = f"m={options.ncm}, M={options.ncM}, b={options.ncb}, n={options.size}"
                        dis_dict[key] = distance_from_sol
                        log_dict[key] = loss_values

                    save_csv_single(loss_values, options, alg_import, key, problem)
            w = csv.writer(open("type_{}_seed_{}_prob_{}_output.csv".format(alg_import.to_string(), options.seed, problem), "w"))
            for key, val in dis_dict.items():
                w.writerow([key, sorted(val)])

        tmp_counts = np.array(total_list_objective_function_counts)
        print(
            f"{alg_import.to_string()} has objective function counts per iteration of:\n\tMEAN: {tmp_counts.mean()}\n\tSTD. DEV.: {tmp_counts.std()}")
Example No. 26
def create_calib_df(vid_filepath,
                    calib_board: CalibrationBoard,
                    frame_indices=None):
    cap = cv2.VideoCapture(vid_filepath)
    #check video opened successfully
    if not cap.isOpened():
        raise VideoReadError(f"Could not open video: {vid_filepath}")
    #determine which frames to look for checkerboard in
    video_frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)

    if frame_indices is None:
        frame_indices = range(int(video_frame_count))
    #check video has enough frames if frame_indices specified
    last_frame_index = max(frame_indices)
    if video_frame_count < last_frame_index:
        raise InvalidFrameError(
            f"Invalid frame indices. Video: {vid_filepath} only has {video_frame_count} frames. Asked for {last_frame_index}"
        )
    #find calibration board in frames
    n_frames = len(frame_indices)
    print(f"Searching {n_frames} frames for {vid_filepath}")
    n_found = 0
    x = []
    y = []
    frames = []
    point_indices = []
    progress_bar_count = 0
    subpix_criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30,
                       0.1)
    print_progress_bar(
        progress_bar_count,
        n_frames,
        prefix='Progress:',
        suffix=f"Complete (calibration board found in {n_found} frames)",
        length=50)
    for frame_idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = cap.read()
        if ret:
            # Find the chess board corners
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            ret, corners = cv2.findChessboardCorners(
                gray, calib_board.shape, cv2.CALIB_CB_ADAPTIVE_THRESH +
                cv2.CALIB_CB_FAST_CHECK + cv2.CALIB_CB_NORMALIZE_IMAGE)
            if ret:
                # Refine found corner positions
                n_found += 1
                corners = cv2.cornerSubPix(gray, corners, (3, 3), (-1, -1),
                                           subpix_criteria)
                for corner_idx, corner in enumerate(corners):
                    x.append(corner[0][0])
                    y.append(corner[0][1])
                    frames.append(frame_idx)
                    point_indices.append(corner_idx)
        progress_bar_count += 1
        print_progress_bar(
            progress_bar_count,
            n_frames,
            prefix='Progress:',
            suffix=
            f"Complete (calibration board found in {n_found}/{progress_bar_count} frames)",
            length=50)

    data = {'x': x, 'y': y, 'frame': frames, 'label': point_indices}
    df = pd.DataFrame(data)
    return df
Example No. 27
def random_forest_feature_rankings(data, target, file_names, feature_names,
                                   labels_key):
    rf_trials = 20
    kfold_trials = 20
    splits = 5
    feature_rankings = {
        name: np.zeros(rf_trials * kfold_trials * splits)
        for name in feature_names
    }
    forest_params = {
        'bootstrap': True,
        'class_weight': None,
        'criterion': 'gini',
        'max_depth': None,
        'max_features': 'auto',
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'min_impurity_split': None,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        'min_weight_fraction_leaf': 0.0,
        'n_estimators': 10,
        'n_jobs': 1,
        'oob_score': False,
        'verbose': 0,
        'warm_start': False
    }
    print(RED + 'Random Forest feature rankings' + RESET)
    print(
        'Obtain rankings by testing different RF seeds and different data splits'
    )
    print('RF seeds tested: 0-' + str(rf_trials - 1) + ' (inclusive)')
    print('Cross validation splitter seeds tested: 0-' +
          str(kfold_trials - 1) + ' (inclusive)')
    print('Number of splits: ' + str(splits))
    print('Labels tested: [' + ', '.join(v + ' (value of ' + str(k) + ')'
                                         for k, v in labels_key.items()) +
          ']')  #TODO should filtering be done here?
    print('Features tested: ' + str(feature_names))
    print('RF parameters: ' + str(forest_params))
    print()

    trial = 0
    for rf_seed in range(rf_trials):
        clf = ensemble.RandomForestClassifier(random_state=rf_seed,
                                              **forest_params)
        for kfold_seed in range(kfold_trials):
            splitter = StratifiedKFold(n_splits=splits,
                                       shuffle=True,
                                       random_state=kfold_seed)
            current_fold = 0
            for train_indices, validate_indices in splitter.split(
                    data, target):
                features_train, features_validate = data[train_indices], data[
                    validate_indices]
                labels_train, labels_validate = target[train_indices], target[
                    validate_indices]

                clf.fit(features_train, labels_train)
                for t in zip(feature_names, clf.feature_importances_):
                    feature_rankings[t[0]][trial] = t[1]
                trial += 1
                print_progress_bar(
                    trial,
                    rf_trials * kfold_trials * splits,
                    prefix='Progress',
                    suffix='rf seed: %d, splitter seed: %d, fold: %d' %
                    (rf_seed, kfold_seed, current_fold))
                current_fold += 1

    print(YELLOW + 'Gini importance averages from ' +
          str(rf_trials * kfold_trials * splits) + ' (' + str(rf_trials) +
          ' * ' + str(kfold_trials) + ' * ' + str(splits) + ') trials' + RESET)
    for t in sorted([(feat, rank) for feat, rank in feature_rankings.items()],
                    key=lambda s: -1 * s[1].mean()):
        print('\t' + '%.6f +/- standard deviation of %.4f' %
              (t[1].mean(), t[1].std()) + ': ' + t[0])
Example no. 28
0
def random_forest_misclassifications(data, target, file_names, feature_names,
                                     labels_key):
    misclass_counter = Counter()
    rf_trials = 20
    kfold_trials = 20
    splits = 5
    forest_params = {
        'bootstrap': True,
        'class_weight': None,
        'criterion': 'gini',
        'max_depth': None,
        'max_features': 'auto',
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'min_impurity_split': None,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        'min_weight_fraction_leaf': 0.0,
        'n_estimators': 10,
        'n_jobs': 1,
        'oob_score': False,
        'verbose': 0,
        'warm_start': False
    }
    print(RED + 'Random Forest misclassifications' + RESET)
    print(
        'Obtain misclassifications by testing different RF seeds and different data splits'
    )
    print('RF seeds tested: 0-' + str(rf_trials - 1) + ' (inclusive)')
    print('Cross validation splitter seeds tested: 0-' +
          str(kfold_trials - 1) + ' (inclusive)')
    print('Number of splits: ' + str(splits))
    print('Labels tested: [' + ', '.join(v + ' (value of ' + str(k) + ')'
                                         for k, v in labels_key.items()) +
          ']')  #TODO should filtering be done here?
    print('Features tested: ' + str(feature_names))
    print('RF parameters: ' + str(forest_params))
    print()

    trial_num = 1
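    # Each file lands in the validation fold exactly once per split run, so it
    # can be misclassified at most rf_trials * kfold_trials times in total.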
    for rf_seed in range(rf_trials):
        clf = ensemble.RandomForestClassifier(random_state=rf_seed,
                                              **forest_params)
        for kfold_seed in range(kfold_trials):
            splitter = StratifiedKFold(n_splits=splits,
                                       shuffle=True,
                                       random_state=kfold_seed)
            current_fold = 0
            for train_indices, validate_indices in splitter.split(
                    data, target):
                features_train, features_validate = data[train_indices], data[
                    validate_indices]
                labels_train, labels_validate = target[train_indices], target[
                    validate_indices]

                clf.fit(features_train, labels_train)
                results = clf.predict(features_validate)
                expected = labels_validate
                for i in range(len(results)):
                    if results[i] != expected[i]:
                        misclass_counter[file_names[validate_indices[i]]] += 1
                print_progress_bar(
                    trial_num,
                    rf_trials * kfold_trials * splits,
                    prefix='Progress',
                    suffix='rf seed: %d, splitter seed: %d, fold: %d' %
                    (rf_seed, kfold_seed, current_fold))
                trial_num += 1
                current_fold += 1

    print(YELLOW + 'Misclassifications from ' +
          str(rf_trials * kfold_trials * splits) + ' (' + str(rf_trials) +
          ' * ' + str(kfold_trials) + ' * ' + str(splits) + ') trials. ' +
          'Each file was in the testing set 1 / ' + str(splits) +
          ' of the time (' + str(rf_trials * kfold_trials) + ' times).' +
          RESET)
    largest_num_size = str(len(str(max(misclass_counter.values()))))
    for t in sorted([(val, cnt) for val, cnt in misclass_counter.items()],
                    key=lambda s: -s[1]):
        print(('%' + largest_num_size + 'd / %d (%2.3f%%): %s') %
              (t[1], rf_trials * kfold_trials,
               t[1] / rf_trials / kfold_trials * 100, t[0]))
Example no. 29
0
def random_forest_averaged_cross_validation(data, target, file_names,
                                            feature_names, labels_key):
    numcorrect_numtotal_f1micro_f1macro_f1weighted = []
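    # One tuple per validation fold:
    # (num correct, fold size, F1 micro, F1 macro, F1 weighted)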
    rf_trials = 20
    kfold_trials = 20
    splits = 5
    forest_params = {
        'bootstrap': True,
        'class_weight': None,
        'criterion': 'gini',
        'max_depth': None,
        'max_features': 'auto',
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'min_impurity_split': None,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        'min_weight_fraction_leaf': 0.0,
        'n_estimators': 10,
        'n_jobs': 1,
        'oob_score': False,
        'verbose': 0,
        'warm_start': False
    }
    print(RED + 'Random Forest averaged cross validation' + RESET)
    print(
        'Obtain averaged scores by testing different RF seeds and different data splits'
    )
    print('RF seeds tested: 0-' + str(rf_trials - 1) + ' (inclusive)')
    print('Cross validation splitter seeds tested: 0-' +
          str(kfold_trials - 1) + ' (inclusive)')
    print('Number of splits: ' + str(splits))
    print('Labels tested: [' + ', '.join(v + ' (value of ' + str(k) + ')'
                                         for k, v in labels_key.items()) +
          ']')  #TODO should filtering be done here?
    print('Features tested: ' + str(feature_names))
    print('RF parameters: ' + str(forest_params))
    print()

    trial_num = 1
    for rf_seed in range(rf_trials):
        clf = ensemble.RandomForestClassifier(random_state=rf_seed,
                                              **forest_params)
        for kfold_seed in range(kfold_trials):
            splitter = StratifiedKFold(n_splits=splits,
                                       shuffle=True,
                                       random_state=kfold_seed)
            current_fold = 0
            for train_indices, validate_indices in splitter.split(
                    data, target):
                features_train, features_validate = data[train_indices], data[
                    validate_indices]
                labels_train, labels_validate = target[train_indices], target[
                    validate_indices]

                clf.fit(features_train, labels_train)
                results = clf.predict(features_validate)
                expected = labels_validate
                numcorrect_numtotal_f1micro_f1macro_f1weighted.append(
                    (sklearn.metrics.accuracy_score(expected,
                                                    results,
                                                    normalize=False),
                     len(results),
                     sklearn.metrics.f1_score(expected,
                                              results,
                                              average='micro'),
                     sklearn.metrics.f1_score(expected,
                                              results,
                                              average='macro'),
                     sklearn.metrics.f1_score(expected,
                                              results,
                                              average='weighted')))
                print_progress_bar(
                    trial_num,
                    rf_trials * kfold_trials * splits,
                    prefix='Progress',
                    suffix='rf seed: %d, splitter seed: %d, fold: %d' %
                    (rf_seed, kfold_seed, current_fold))
                trial_num += 1
                current_fold += 1

    print(YELLOW + 'Averaged percentages from ' +
          str(rf_trials * kfold_trials * splits) + ' (' + str(rf_trials) +
          ' * ' + str(kfold_trials) + ' * ' + str(splits) + ') trials.' +
          RESET)
    print(
        '\t' +
        'Percentage correct: %s%.4f%s%% +/- standard deviation of %.4f%%' %
        (GREEN,
         sum(tup[0]
             for tup in numcorrect_numtotal_f1micro_f1macro_f1weighted) /
         sum(tup[1]
             for tup in numcorrect_numtotal_f1micro_f1macro_f1weighted) * 100,
         RESET,
         statistics.stdev(
             tup[0] / tup[1]
             for tup in numcorrect_numtotal_f1micro_f1macro_f1weighted) * 100))
    print(
        '\t' + 'F1 micro score: %s%.4f%s%% +/- standard deviation of %.4f%%' %
        (GREEN,
         sum(tup[2]
             for tup in numcorrect_numtotal_f1micro_f1macro_f1weighted) /
         len(numcorrect_numtotal_f1micro_f1macro_f1weighted) * 100, RESET,
         statistics.stdev(
             tup[2]
             for tup in numcorrect_numtotal_f1micro_f1macro_f1weighted) * 100))
    print(
        '\t' + 'F1 macro score: %s%.4f%s%% +/- standard deviation of %.4f%%' %
        (GREEN,
         sum(tup[3]
             for tup in numcorrect_numtotal_f1micro_f1macro_f1weighted) /
         len(numcorrect_numtotal_f1micro_f1macro_f1weighted) * 100, RESET,
         statistics.stdev(
             tup[3]
             for tup in numcorrect_numtotal_f1micro_f1macro_f1weighted) * 100))
    print(
        '\t' +
        'F1 weighted score: %s%.4f%s%% +/- standard deviation of %.4f%%' %
        (GREEN,
         sum(tup[4]
             for tup in numcorrect_numtotal_f1micro_f1macro_f1weighted) /
         len(numcorrect_numtotal_f1micro_f1macro_f1weighted) * 100, RESET,
         statistics.stdev(
             tup[4]
             for tup in numcorrect_numtotal_f1micro_f1macro_f1weighted) * 100))
Example no. 30
0
from progress_bar import print_progress_bar

##############################################################
print_progress_bar(47, 100, length=30, title='In loop')
print()  # new line after finish

##############################################################
custom_mode = {
    'length': 6,
    'char_begin': '<',
    'char_end': '>',
    'char_full': '#',
    'char_empty': ' ',
    'title': 'my custom mode',
}
print_progress_bar(47, 80, **custom_mode)
print()  # new line after finish

##############################################################
print_progress_bar(12, 20, length=8, char_empty='#', char_full='-')
print()

##############################################################
print_progress_bar(1, 10, length=4)
print()

##############################################################
my_theme = {
    'length': 12,
    'char_begin': '>',
    'char_end': '',