def dataset_inference_unlabeled(dataset_path,
                                data_transforms,
                                model_path,
                                save_path,
                                batch_size=64,
                                force=False,
                                seed=None,
                                verbose=True):
    '''
        Perform inference on an unlabeled dataset, using a CSV index file as
        reference.

        force: Boolean
            If False, look for an existing output file at save_path and return
            it if found. If True, or if no output file exists, run inference on
            the dataset and save the results to save_path.
    '''
    if os.path.isfile(save_path) and not force:
        outputDf = utils.load_pickle(save_path)
        if len(outputDf) > 0:
            return outputDf

    unlabelIndex = IndexManager(dataset_path)

    # Drop duplicated files
    unlabelIndex.index = dutils.remove_duplicates(unlabelIndex.index,
                                                  "FrameHash")

    # Drop missing or corrupt images
    unlabelIndex.index = dutils.check_df_files(unlabelIndex.index,
                                               utils.check_empty_file,
                                               "FramePath")

    imagePathList = unlabelIndex.index["FramePath"].values
    datasetLen = len(imagePathList)

    if verbose:
        print("\nUnlabeled set inference")
        print("\nDataset information: ")
        print("\t", datasetLen, "images.")

    # Placeholder label list for the unlabeled dataset (_model_inference
    # expects a label argument even when no labels exist)
    labelList = np.zeros(datasetLen)

    outputDf = _model_inference(imagePathList, data_transforms, labelList,
                                model_path, batch_size)

    ## Save output to pickle file
    if verbose:
        print("\nSaving outputs file to ", save_path)
    outputDf.to_pickle(save_path)
    return outputDf
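
# A minimal usage sketch for the function above; the paths and the transform
# object are hypothetical placeholders, not names from this project:
#
#   outputDf = dataset_inference_unlabeled("unlabeled_index.csv",
#                                          my_transforms,
#                                          "model_checkpoint.pt",
#                                          "outputs.pickle",
#                                          batch_size=64)
#
# A second call with the same save_path and force=False (the default) returns
# the cached pickle instead of re-running inference.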


# Example 2

    # TODO: Encapsulate sampledImages processing in function
    # Add FrameHash column
    if "imagem" in sampledIndex.columns:
        fileList = sampledIndex["imagem"].values
    elif "FrameName" in sampledIndex.columns:
        fileList = sampledIndex["FrameName"].values
    else:
        raise KeyError("DataFrame doesn't have a known image path column.")
    
    sampledIndex["FrameHash"] = utils.compute_file_hash_list(fileList, folder=dirs.febe_image_dataset)

    # Get missing information from original Unlabeled index
    sampledIndex = dutils.fill_index_information(originalUnlabeledIndex, sampledIndex,
                                                 "FrameHash", ["rede1", "rede2", "rede3"])

    sampledIndex = dutils.remove_duplicates(sampledIndex, "FrameHash")
    autoIndex    = dutils.remove_duplicates(autoIndex, "FrameHash")

    # TODO: Use merge_indexes in merge_manual_auto_sets
    mergedIndex = dutils.merge_manual_auto_sets(sampledIndex, autoIndex)
    print(mergedIndex.shape)

    mergedIndex.to_csv(mergedIndexPath, index=False)

    ## Create unlabeled set for next iteration
    # TODO: Encapsulate this section in function
    print("\nCreate new unlabeled set.")
    mergedPathList = [get_iter_folder(x) / \
        "final_annotated_images_iteration_{}.csv".format(x) for x in range(1, iteration+1)]
    mergedIndexList = [pd.read_csv(x) for x in mergedPathList]
    originalUnlabeledIndex  = pd.read_csv(originalUnlabeledIndexPath)
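
    # For illustration: with iteration == 2, mergedPathList above resolves to
    # the final_annotated_images csv paths for iterations 1 and 2.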


# Example 3

    indexSampled = IndexManager(sampledIndexPath)
    if "imagem" in indexSampled.index.columns:
        indexSampled.index["FrameName"] = indexSampled.index["imagem"].copy()
    indexSampled.index["FramePath"] = indexSampled.index["FrameName"].map(_add_folder_path)
    
    eTime = indexSampled.compute_frame_hashes(reference_column="FramePath", verbose=True)  # eTime: presumably elapsed time

    indexSampled.write_index(dest_path=manualIndexPath, make_backup=False, prompt=False)

    ## Merge manual annotated labels from current and previous iterations
    if iteration > 1:
        oldLabels = pd.read_csv(prevManualIndexPath)
        newLabels = pd.read_csv(manualIndexPath)

        # Remove duplicates
        oldLabels = dutils.remove_duplicates(oldLabels, "FrameHash")
        newLabels = dutils.remove_duplicates(newLabels, "FrameHash")

        # Get additional information for newLabels from main unlabeled index
        # TODO: Don't do this again when merging auto and manual annotated indexes
        originalUnlabeledIndex = pd.read_csv(originalUnlabeledIndexPath)
        originalUnlabeledIndex = dutils.remove_duplicates(originalUnlabeledIndex, "FrameHash")

        newLabels = dutils.fill_index_information(originalUnlabeledIndex, newLabels,
                                                  "FrameHash", ["rede1", "rede2", "rede3"])
        oldLabels = dutils.fill_index_information(originalUnlabeledIndex, oldLabels,
                                                  "FrameHash", ["rede1", "rede2", "rede3"])

        mergedIndex = pd.concat([newLabels, oldLabels], axis=0, sort=False)
        mergedIndex.to_csv(manualIndexPath, index=False)


# Example 4

# (The opening of this example is truncated in the source; 'resultsFolder' is
# a hypothetical name for the completed assignment.)
resultsFolder = Path(
    dirs.images) / "{}_results_samples".format(datasetName)
loopFolder = Path(dirs.iter_folder) / datasetName
referenceIndexPath = loopFolder / "iteration_0/reference_images.csv"
previousLevelIndexPath = loopFolder / "iteration_0/unlabeled_images_iteration_0.csv"
autoIndexFullPath = loopFolder / "final_automatic_images_{}.csv".format(
    datasetName)
manualIndexFullPath = loopFolder / "final_manual_images_{}.csv".format(
    datasetName)
annotatedIndexFullPath = loopFolder / "final_annotated_images_{}.csv".format(
    datasetName)
reportPath = loopFolder / "annotation_report.txt"
binaryDatasetPath = Path(
    dirs.iter_folder) / "dataset_rede_{}_eval_setup.csv".format(rede)

referenceIndex = pd.read_csv(referenceIndexPath, low_memory=False)
referenceIndex = dutils.remove_duplicates(referenceIndex, "FrameHash")

previousLevelIndex = pd.read_csv(previousLevelIndexPath, low_memory=False)
previousLevelIndex = dutils.remove_duplicates(previousLevelIndex, "FrameHash")

# Get list of all iteration folders
folderList = utils.make_path(glob(str(loopFolder) + "/iteration*"))
folderList = [path for path in folderList if path.is_dir()]

if rede == 3:
    print("\nNet {}, target class {}.".format(rede, target_class))
else:
    print("\nNet {}.".format(rede))


# Example 5

    rede      = 1
    batchSize = 64

    unlabelIndexPath  = Path(dirs.iter_folder) / \
                "full_dataset/iteration_{}/unlabeled_images_iteration_{}.csv".format(iteration, iteration)
    savedModelsFolder = Path(dirs.saved_models) / "full_dataset_rede_{}/iteration_{}".format(rede, iteration)
    modelPath         = savedModelsFolder / \
                "full_dataset_no_finetune_{}_epochs_rede_{}_iteration_{}.pt".format(epochs, rede, iteration)
    savePath          = savedModelsFolder / \
                "outputs_full_dataset_{}_epochs_rede_{}_iteration_{}.pickle".format(epochs, rede, iteration)


    unlabelIndex = IndexManager(unlabelIndexPath)

    # Drop duplicated files
    unlabelIndex.index = dutils.remove_duplicates(unlabelIndex.index, "FrameHash")

    # Drop missing or corrupt images
    unlabelIndex.index = dutils.check_df_files(unlabelIndex.index, utils.check_empty_file, "FramePath")

    
    imagePathList = unlabelIndex.index["FramePath"].values
    datasetLen    = len(imagePathList)

    print("\nDataset information: ")
    print("\t", datasetLen, "images.")
    
    # ImageNet statistics
    mean    = commons.IMAGENET_MEAN
    std     = commons.IMAGENET_STD 
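
    # A sketch of how these statistics are typically consumed downstream,
    # assuming torchvision preprocessing (the actual pipeline is not part of
    # this truncated example):
    #
    #   import torchvision.transforms as transforms
    #   dataTransforms = transforms.Compose([
    #       transforms.Resize(256),
    #       transforms.CenterCrop(224),
    #       transforms.ToTensor(),
    #       transforms.Normalize(mean, std),
    #   ])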


# Example 6

# (The function header is truncated in the source; it is reconstructed here
# from the merge_labeled_sets call at the bottom of this example.)
def merge_labeled_sets(manualIndex, autoIndex):
    mergedIndex = pd.concat([manualIndex, autoIndex], axis=0, sort=False)
    return mergedIndex.copy()


unlabeledPath = Path(
    dirs.index) / "unlabeled_index_2019-8-18_19-32-37_HASHES.csv"
manualPath = Path(dirs.iter_folder
                  ) / "full_dataset/iteration_1/sampled_images_iteration_1.csv"
autoPath = Path(
    dirs.iter_folder
) / "full_dataset/iteration_1/automatic_labeled_images_iteration_1.csv"

manualIndex = pd.read_csv(manualPath)
autoIndex = pd.read_csv(autoPath)

manualIndex = dutils.remove_duplicates(manualIndex, "FrameHash")
autoIndex = dutils.remove_duplicates(autoIndex, "FrameHash")

# Get additional information for manualIndex from main unlabeled index
# TODO: Do this as the second iteration step
unlabeledIndex = pd.read_csv(unlabeledPath)
unlabeledIndex = dutils.remove_duplicates(unlabeledIndex, "FrameHash")

manualIndex = fill_index_information(unlabeledIndex, manualIndex, "FrameHash",
                                     ["rede1", "rede2", "rede3", "set"])
print(manualIndex.head())
print(manualIndex.shape)

mergedIndex = merge_labeled_sets(manualIndex, autoIndex)
print(mergedIndex.head())
print(mergedIndex.shape)


# Example 7

# (The opening line is truncated in the source; 'unlabelIndexPath' is
# recovered from its use below.)
unlabelIndexPath = Path(
    dirs.index) / "unlabeled_index_2019-8-18_19-32-37_HASHES.csv"
sampledIndexPath = Path(
    dirs.iter_folder
) / "full_dataset/iteration_1/final_annotated_images_iteration_1.csv"

newUnlabelIndexPath = Path(
    dirs.iter_folder
) / "full_dataset/iteration_2/unlabeled_images_iteration_2.csv"

# Load model outputs and unlabeled images index
# indexUnlabel = IndexManager(unlabelIndexPath)
# indexSampled = IndexManager(sampledIndexPath)
indexUnlabel = pd.read_csv(unlabelIndexPath)
indexSampled = pd.read_csv(sampledIndexPath)
print(indexUnlabel.index.shape)

indexUnlabel = dutils.remove_duplicates(indexUnlabel, "FrameHash")

indexUnlabel.set_index("FrameHash", drop=False, inplace=True)
indexSampled.set_index("FrameHash", drop=False, inplace=True)

print(indexUnlabel.index.duplicated().sum())
print(indexSampled.index.duplicated().sum())

newIndex = dutils.index_complement(indexUnlabel, indexSampled, "FrameHash")
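
# dutils.index_complement presumably keeps the rows of indexUnlabel whose
# FrameHash does not appear in indexSampled, i.e. roughly:
#   indexUnlabel.loc[~indexUnlabel["FrameHash"].isin(indexSampled["FrameHash"])]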
print(newIndex.shape)

# indexUnlabel.write_index(newUnlabelIndexPath, prompt=False, backup=False)
dirs.create_folder(newUnlabelIndexPath.parent)
newIndex.to_csv(newUnlabelIndexPath, index=False)


# Example 8

print("\nConcatenating positive indexes...")
compiledPositivesIndex = pd.concat(positivesList.values())


def _translate_dup_label(label_list):
    for priority_label in commons.rede_3_multiclass_priority_table:
        if priority_label in label_list:
            return priority_label
    return None
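
# For illustration: if the priority table were the (hypothetical) list
# ['labelA', 'labelB', 'labelC'], a frame annotated as both 'labelB' and
# 'labelC' would resolve to 'labelB', the highest-priority label present.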


# Split compiled index in duplicated and non-duplicated entries
duplicatesMask = compiledPositivesIndex.duplicated(
    subset=commons.FRAME_HASH_COL_NAME, keep=False)
duplicatesIndex = compiledPositivesIndex.loc[duplicatesMask, :]
frameGroup = duplicatesIndex.groupby(commons.FRAME_HASH_COL_NAME)

print("\nTranslating all duplicates...")
compiledPositivesIndex = dutils.remove_duplicates(compiledPositivesIndex,
                                                  commons.FRAME_HASH_COL_NAME)
compiledPositivesIndex.set_index('FrameHash', drop=False, inplace=True)
for frameHash, group in tqdm(frameGroup):
    newLabel = _translate_dup_label(group['rede3'].values)
    compiledPositivesIndex.loc[frameHash, 'rede3'] = newLabel

compiledPositivesIndex.reset_index(drop=True, inplace=True)

print(compiledPositivesIndex['rede3'][:30])

compiledPositivesIndex.to_csv(compiledPositivesPath, index=False)


# Example 9

                "full_dataset_softmax/iteration_{}/unlabeled_images_iteration_{}.csv".format(iteration-1, iteration-1)
savedModelsFolder = Path(
    dirs.saved_models) / "full_dataset_rede_{}_softmax/iteration_{}".format(
        rede, iteration)
outputPath   = savedModelsFolder / \
                "outputs_full_dataset_{}_epochs_rede_{}_iteration_{}.pickle".format(epochs, rede, iteration)
newIndexPath = Path(dirs.iter_folder) / \
                "full_dataset/iteration_{}/automatic_labeled_images_iteration_{}.csv".format(iteration, iteration)

idealUpperThresh = 0.8923  # Ratio 99%
idealLowerThresh = 0.0904  # Ratio 1%
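
# Presumably, samples whose output score is above idealUpperThresh are
# auto-labeled positive and those below idealLowerThresh negative, with scores
# in between left for manual annotation (the exact rule is implemented in
# dutils.automatic_labeling).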

indexDf = pd.read_csv(indexPath)
pickleData = utils.load_pickle(outputPath)

indexDf = dutils.remove_duplicates(indexDf, "FrameHash")
outputs, imgHashes, _ = dutils.load_outputs_df(outputPath)

outputs = outputs[:, 0]  # presumably the positive-class score column

indexDf.set_index("FrameHash", drop=False, inplace=True)

print("\nAutomatic labeling with upper positive ratio 99%:")
posHashes, negHashes = dutils.automatic_labeling(outputs, imgHashes,
                                                 idealUpperThresh,
                                                 idealLowerThresh)

newLabeledIndex = dutils.get_classified_index(indexDf,
                                              posHashes,
                                              negHashes,
                                              verbose=False)
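
# (This example is presumably truncated here; newIndexPath, defined above but
# otherwise unused, would be the natural destination, e.g.:
#   newLabeledIndex.to_csv(newIndexPath, index=False))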