def dataset_inference_unlabeled(dataset_path, data_transforms, model_path, save_path,
                                batch_size=64, force=False, seed=None, verbose=True):
    '''
    Perform inference on an unlabeled dataset, using a csv Index file as reference.

    dataset_path: string or Path
        Path to the csv index of the unlabeled dataset.
    data_transforms: torchvision-style transform(s) applied to each image.
    model_path: string or Path
        Path to the saved model used for inference.
    save_path: string or Path
        Pickle file where the resulting outputs DataFrame is cached.
    batch_size: int
        Inference batch size.
    force: Boolean
        If force is False, search for an existing output file and use it, if it exists.
        If force is True or output file doesn't exist, compute dataset output and
        save to file.
    seed: currently unused; kept for interface compatibility.  # TODO confirm intent
    verbose: Boolean
        If True, print progress information.

    Returns the outputs DataFrame (cached or freshly computed).
    '''
    # Cache hit: reuse a previously computed, non-empty outputs file.
    if os.path.isfile(save_path) and not force:
        outputDf = utils.load_pickle(save_path)
        if len(outputDf) > 0:
            return outputDf

    unlabelIndex = IndexManager(dataset_path)

    # Drop duplicated files
    unlabelIndex.index = dutils.remove_duplicates(unlabelIndex.index, "FrameHash")

    # Drop missing or corrupt images
    unlabelIndex.index = dutils.check_df_files(unlabelIndex.index,
                                               utils.check_empty_file, "FramePath")

    imagePathList = unlabelIndex.index["FramePath"].values
    datasetLen = len(imagePathList)

    if verbose:
        print("\nUnlabeled set inference")
        print("\nDataset information: ")
        print("\t", datasetLen, "images.")

    # Label list for an unlabeled dataset (bit of a hack? is there a better way?)
    labelList = np.zeros(datasetLen)

    outputDf = _model_inference(imagePathList, data_transforms, labelList,
                                model_path, batch_size)

    ## Save output to pickle file
    if verbose:
        print("\nSaving outputs file to ", save_path)
    outputDf.to_pickle(save_path)

    # Bug fix: previously the fresh-compute path fell off the end and returned
    # None; the cached path returned a DataFrame. Return the outputs in both cases.
    return outputDf
# TODO: Encapsulate sampledImages processing in function # Add FrameHash column if "imagem" in sampledIndex.columns: fileList = sampledIndex["imagem"].values elif "FrameName" in sampledIndex.columns: fileList = sampledIndex["FrameName"].values else: raise KeyError("DataFrame doesn't have a known image path column.") sampledIndex["FrameHash"] = utils.compute_file_hash_list(fileList, folder=dirs.febe_image_dataset) # Get missing information from original Unlabeled index sampledIndex = dutils.fill_index_information(originalUnlabeledIndex, sampledIndex, "FrameHash", [ 'rede1', 'rede2', 'rede3']) sampledIndex = dutils.remove_duplicates(sampledIndex, "FrameHash") autoIndex = dutils.remove_duplicates(autoIndex, "FrameHash") # TODO: Use merge_indexes in merge_manual_auto_sets mergedIndex = dutils.merge_manual_auto_sets(sampledIndex, autoIndex) print(mergedIndex.shape) mergedIndex.to_csv(mergedIndexPath, index=False) ## Create unlabeled set for next iteration # TODO: Encapsulate this section in function print("\nCreate new unlabeled set.") mergedPathList = [get_iter_folder(x) / \ "final_annotated_images_iteration_{}.csv".format(x) for x in range(1, iteration+1)] mergedIndexList = [pd.read_csv(x) for x in mergedPathList] originalUnlabeledIndex = pd.read_csv(originalUnlabeledIndexPath)
# --- Normalize the sampled-images index (hashes, paths) and merge the current
# --- iteration's manual labels with those of previous iterations.
indexSampled = IndexManager(sampledIndexPath)

# Legacy column name: copy "imagem" into the expected "FrameName" column.
if "imagem" in indexSampled.index.columns:
    indexSampled.index["FrameName"] = indexSampled.index["imagem"].copy()
indexSampled.index["FramePath"] = indexSampled.index["FrameName"].map(_add_folder_path)

# eTime: elapsed time of the hashing pass (as returned by compute_frame_hashes).
eTime = indexSampled.compute_frame_hashes(reference_column="FramePath", verbose=True)

indexSampled.write_index(dest_path=manualIndexPath, make_backup=False, prompt=False)

## Merge manual annotated labels from current and previous iterations
# Only applies from the second iteration onward (no previous labels before that).
if iteration > 1:
    oldLabels = pd.read_csv(prevManualIndexPath)
    newLabels = pd.read_csv(manualIndexPath)

    # Remove duplicates
    oldLabels = dutils.remove_duplicates(oldLabels, "FrameHash")
    newLabels = dutils.remove_duplicates(newLabels, "FrameHash")

    # Get additional information for newLabels from main unlabeled index
    # TODO: Don't do this again when merging auto and manual annotated indexes
    originalUnlabeledIndex = pd.read_csv(originalUnlabeledIndexPath)
    originalUnlabeledIndex = dutils.remove_duplicates(originalUnlabeledIndex, "FrameHash")

    newLabels = dutils.fill_index_information(originalUnlabeledIndex, newLabels,
                                              "FrameHash",
                                              ['rede1', 'rede2', 'rede3'])
    oldLabels = dutils.fill_index_information(originalUnlabeledIndex, oldLabels,
                                              "FrameHash",
                                              ['rede1', 'rede2', 'rede3'])

    # Stack new and old labels; overwrite the manual index file with the union.
    mergedIndex = pd.concat([newLabels, oldLabels], axis=0, sort=False)
    mergedIndex.to_csv(manualIndexPath, index=False)
# NOTE(review): the first statement below is a continuation — its opening
# (something like `<resultsFolder> = Path(`) lies before this chunk. Verify
# against the surrounding file before moving this block.
dirs.images) / "{}_results_samples".format(datasetName)

# --- Path setup for one annotation-loop dataset plus sanity-cleaned reference
# --- and previous-level indexes.
loopFolder = Path(dirs.iter_folder) / datasetName
referenceIndexPath = loopFolder / "iteration_0/reference_images.csv"
previousLevelIndexPath = loopFolder / "iteration_0/unlabeled_images_iteration_0.csv"
autoIndexFullPath = loopFolder / "final_automatic_images_{}.csv".format(datasetName)
manualIndexFullPath = loopFolder / "final_manual_images_{}.csv".format(datasetName)
annotatedIndexFullPath = loopFolder / "final_annotated_images_{}.csv".format(datasetName)
reportPath = loopFolder / "annotation_report.txt"
binaryDatasetPath = Path(dirs.iter_folder) / "dataset_rede_{}_eval_setup.csv".format(rede)

referenceIndex = pd.read_csv(referenceIndexPath, low_memory=False)
referenceIndex = dutils.remove_duplicates(referenceIndex, "FrameHash")

previousLevelIndex = pd.read_csv(previousLevelIndexPath, low_memory=False)
previousLevelIndex = dutils.remove_duplicates(previousLevelIndex, "FrameHash")

# Get list of all iteration folders
folderList = utils.make_path(glob(str(loopFolder) + "/iteration*"))
# Iterate over a copy so removing non-directories doesn't skip elements.
tempList = copy(folderList)
for path in tempList:
    if not (path.is_dir()):
        folderList.remove(path)

# Net 3 is multiclass: also report the target class being annotated.
if rede == 3:
    print("\nNet {}, target class {}.".format(rede, target_class))
else:
    print("\nNet {}.".format(rede))
# --- Inference setup for one iteration: resolve model/output paths and clean
# --- the unlabeled index before building the image list.
rede = 1
batchSize = 64
unlabelIndexPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_{}/unlabeled_images_iteration_{}.csv".format(iteration, iteration)
savedModelsFolder = Path(dirs.saved_models) / \
    "full_dataset_rede_{}/iteration_{}".format(rede, iteration)
modelPath = savedModelsFolder / \
    "full_dataset_no_finetune_{}_epochs_rede_{}_iteration_{}.pt".format(epochs, rede, iteration)
savePath = savedModelsFolder / \
    "outputs_full_dataset_{}_epochs_rede_{}_iteration_{}.pickle".format(epochs, rede, iteration)

unlabelIndex = IndexManager(unlabelIndexPath)

# Drop duplicated files
unlabelIndex.index = dutils.remove_duplicates(unlabelIndex.index, "FrameHash")

# Drop missing or corrupt images
unlabelIndex.index = dutils.check_df_files(unlabelIndex.index,
                                           utils.check_empty_file, "FramePath")

imagePathList = unlabelIndex.index["FramePath"].values
datasetLen = len(imagePathList)

print("\nDataset information: ")
print("\t", datasetLen, "images.")

# ImageNet statistics
# Normalization constants, presumably fed to the image transforms defined later.
mean = commons.IMAGENET_MEAN
std = commons.IMAGENET_STD
# NOTE(review): the two indented lines below are the tail of a function whose
# `def` lies before this chunk (apparently the `merge_labeled_sets` called
# further down — confirm against the full file).
    mergedIndex = pd.concat([manualIndex, autoIndex], axis=0, sort=False)
    return mergedIndex.copy()


# --- Script: load iteration-1 manual and automatic indexes, enrich the manual
# --- one from the main unlabeled index, and merge the two labeled sets.
unlabeledPath = Path(dirs.index) / "unlabeled_index_2019-8-18_19-32-37_HASHES.csv"
manualPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_1/sampled_images_iteration_1.csv"
autoPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_1/automatic_labeled_images_iteration_1.csv"

manualIndex = pd.read_csv(manualPath)
autoIndex = pd.read_csv(autoPath)

# De-duplicate on the frame hash so it can serve as a unique merge key.
manualIndex = dutils.remove_duplicates(manualIndex, "FrameHash")
autoIndex = dutils.remove_duplicates(autoIndex, "FrameHash")

# Get additional information for manualIndex from main unlabeled index
# TODO: Do this as the second iteration step
unlabeledIndex = pd.read_csv(unlabeledPath)
unlabeledIndex = dutils.remove_duplicates(unlabeledIndex, "FrameHash")
manualIndex = fill_index_information(unlabeledIndex, manualIndex, "FrameHash",
                                     ["rede1", "rede2", "rede3", "set"])
print(manualIndex.head())
print(manualIndex.shape)

mergedIndex = merge_labeled_sets(manualIndex, autoIndex)
print(mergedIndex.head())
print(mergedIndex.shape)
# NOTE(review): the first statement below is a continuation — its opening
# (presumably `unlabelIndexPath = Path(`) lies before this chunk.
dirs.index) / "unlabeled_index_2019-8-18_19-32-37_HASHES.csv"

# --- Build the next iteration's unlabeled set as the complement of the
# --- already-annotated images within the current unlabeled index.
sampledIndexPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_1/final_annotated_images_iteration_1.csv"
newUnlabelIndexPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_2/unlabeled_images_iteration_2.csv"

# Load model outputs and unlabeled images index
# indexUnlabel = IndexManager(unlabelIndexPath)
# indexSampled = IndexManager(sampledIndexPath)
indexUnlabel = pd.read_csv(unlabelIndexPath)
indexSampled = pd.read_csv(sampledIndexPath)

print(indexUnlabel.index.shape)
indexUnlabel = dutils.remove_duplicates(indexUnlabel, "FrameHash")

# Index both frames on FrameHash (kept as a column too) for the set complement.
indexUnlabel.set_index("FrameHash", drop=False, inplace=True)
indexSampled.set_index("FrameHash", drop=False, inplace=True)

# Sanity check: should both print 0 after de-duplication.
print(indexUnlabel.index.duplicated().sum())
print(indexSampled.index.duplicated().sum())

# newIndex = unlabeled entries whose hash is NOT in the sampled (annotated) set.
newIndex = dutils.index_complement(indexUnlabel, indexSampled, "FrameHash")
print(newIndex.shape)

# indexUnlabel.write_index(newUnlabelIndexPath, prompt=False, backup=False)
dirs.create_folder(newUnlabelIndexPath.parent)
newIndex.to_csv(newUnlabelIndexPath, index=False)
# --- Compile all positive indexes into one, resolving frames that received
# --- multiple conflicting 'rede3' labels via a priority table.
print("\nConcatenating positive indexes...")
compiledPositivesIndex = pd.concat(positivesList.values())


def _translate_dup_label(label_list):
    # Resolve a duplicated frame's conflicting labels to a single label:
    # return the first entry of the priority table present in label_list,
    # or None when no priority label matches.
    for priority_label in commons.rede_3_multiclass_priority_table:
        if priority_label in label_list:
            return priority_label
    return None


# Split compiled index in duplicated and non-duplicated entries
# keep=False marks every member of a duplicate group, not just the extras.
duplicatesMask = compiledPositivesIndex.duplicated(
    subset=commons.FRAME_HASH_COL_NAME, keep=False)
duplicatesIndex = compiledPositivesIndex.loc[duplicatesMask, :]
frameGroup = duplicatesIndex.groupby(commons.FRAME_HASH_COL_NAME)

print("\nTranslating all duplicates...")
# Keep one row per frame, then rewrite its 'rede3' label from the full
# duplicate group via the priority translation above.
compiledPositivesIndex = dutils.remove_duplicates(compiledPositivesIndex,
                                                  commons.FRAME_HASH_COL_NAME)
compiledPositivesIndex.set_index('FrameHash', drop=False, inplace=True)
for frameHash, group in tqdm(frameGroup):
    newLabel = _translate_dup_label(group['rede3'].values)
    compiledPositivesIndex.loc[frameHash, 'rede3'] = newLabel
compiledPositivesIndex.reset_index(drop=True, inplace=True)

print(compiledPositivesIndex['rede3'][:30])
compiledPositivesIndex.to_csv(compiledPositivesPath, index=False)
# NOTE(review): the first statement below is a continuation — its opening
# (presumably `indexPath = Path(dirs.iter_folder) / \`) lies before this chunk.
"full_dataset_softmax/iteration_{}/unlabeled_images_iteration_{}.csv".format(iteration-1, iteration-1)

# --- Automatic labeling: classify unlabeled images whose model output falls
# --- outside the [lower, upper] uncertainty band.
savedModelsFolder = Path(dirs.saved_models) / \
    "full_dataset_rede_{}_softmax/iteration_{}".format(rede, iteration)
outputPath = savedModelsFolder / \
    "outputs_full_dataset_{}_epochs_rede_{}_iteration_{}.pickle".format(epochs, rede, iteration)
newIndexPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_{}/automatic_labeled_images_iteration_{}.csv".format(iteration, iteration)

# Decision thresholds chosen for the stated positive-ratio operating points.
idealUpperThresh = 0.8923  # Ratio 99%
idealLowerThresh = 0.0904  # Ratio 1%

indexDf = pd.read_csv(indexPath)
# NOTE(review): pickleData is not used in this chunk (load_outputs_df reloads
# the same file below) — confirm it isn't consumed later before removing.
pickleData = utils.load_pickle(outputPath)

indexDf = dutils.remove_duplicates(indexDf, "FrameHash")

outputs, imgHashes, _ = dutils.load_outputs_df(outputPath)
# Keep only the first output column (positive-class score, presumably).
outputs = outputs[:, 0]

indexDf.set_index("FrameHash", drop=False, inplace=True)

print("\nAutomatic labeling with upper positive ratio 99%:")
posHashes, negHashes = dutils.automatic_labeling(outputs, imgHashes,
                                                 idealUpperThresh, idealLowerThresh)

newLabeledIndex = dutils.get_classified_index(indexDf, posHashes, negHashes,
                                              verbose=False)