Example #1
    def load_info(self):
        if self.iterInfoPath.is_file():
            self.iterInfo = utils.load_pickle(self.iterInfoPath)
        else:
            self.iterInfo = IterInfo(self.unlabeledFolder,
                                     self.unlabeledIndexPath, self.loopFolder)
            dirs.create_folder(self.loopFolder)

            utils.save_pickle(self.iterInfo, self.iterInfoPath)

        return self.iterInfo
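
Note: the load-or-create pattern above depends on two pickle helpers from the project's utils module, which are not shown on this page. A minimal sketch, assuming they are thin wrappers around the standard pickle module:

import pickle

def load_pickle(path):
    # Assumed behavior: deserialize a single object from a binary pickle file.
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(obj, path):
    # Assumed behavior: serialize one object to path, overwriting any existing file.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)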
Example #2
def dataset_inference_unlabeled(dataset_path,
                                data_transforms,
                                model_path,
                                save_path,
                                batch_size=64,
                                force=False,
                                seed=None,
                                verbose=True):
    '''
        Perform inference on an unlabeled dataset, using a csv index file as reference.

        force: Boolean
            If False, search for an existing output file and reuse it if it exists.
            If True, or if the output file doesn't exist, compute the dataset outputs
            and save them to file.
    '''
    if os.path.isfile(save_path) and not force:
        outputDf = utils.load_pickle(save_path)
        if len(outputDf) > 0:
            return outputDf

    unlabelIndex = IndexManager(dataset_path)

    # Drop duplicated files
    unlabelIndex.index = dutils.remove_duplicates(unlabelIndex.index,
                                                  "FrameHash")

    # Drop missing or corrupt images
    unlabelIndex.index = dutils.check_df_files(unlabelIndex.index,
                                               utils.check_empty_file,
                                               "FramePath")

    imagePathList = unlabelIndex.index["FramePath"].values
    datasetLen = len(imagePathList)

    if verbose:
        print("\nUnlabeled set inference")
        print("\nDataset information: ")
        print("\t", datasetLen, "images.")

    # Label list for an unlabeled dataset (bit of a hack? is there a better way?)
    labelList = np.zeros(datasetLen)

    outputDf = _model_inference(imagePathList, data_transforms, labelList,
                                model_path, batch_size)

    ## Save output to pickle file
    if verbose:
        print("\nSaving outputs file to ", save_path)
    outputDf.to_pickle(save_path)
    return outputDf
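
A hypothetical call site for this function; the paths and the transform below are placeholders, not values from the original project:

from torchvision import transforms

# Placeholder validation transform; the project passes its own data_transforms.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

outputDf = dataset_inference_unlabeled("data/unlabeled_index.csv",  # csv index with a FramePath column
                                       val_transform,
                                       "models/checkpoint.pt",
                                       "results/unlabeled_outputs.pickle",
                                       batch_size=32,
                                       force=False)  # reuse the cached output file if present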
Example #3
def dataset_inference_val(dataset_path,
                          data_transforms,
                          model_path,
                          save_path,
                          batch_size=64,
                          force=False,
                          seed=None,
                          verbose=True):
    '''
        Perform inference on the validation set and save outputs to file.

        force: Boolean
            If False, search for an existing output file and reuse it if it exists.
            If True, or if the output file doesn't exist, compute the dataset outputs
            and save them to file.
    '''
    if os.path.isfile(save_path) and not force:
        outputDf = utils.load_pickle(save_path)
        if len(outputDf) > 0:
            return outputDf

    # Get list of image paths from dataset folder
    dataset = datasets.ImageFolder(str(dataset_path),
                                   transform=data_transforms,
                                   is_valid_file=utils.check_empty_file)
    imageTupleList = dataset.imgs
    datasetLen = len(imageTupleList)
    labelList = dataset.targets

    imagePathList = np.array(dataset.imgs)[:, 0]

    if verbose:
        print("Validation set inference.")
        print("\nDataset information: ")
        print("\t", datasetLen, "images.")
        print("\nClasses: ")
        for key in dataset.class_to_idx.keys():
            print("\t{}: {}".format(dataset.class_to_idx[key], key))

    outputDf = _model_inference(imagePathList, data_transforms, labelList,
                                model_path, batch_size)

    ## Save output to pickle file
    if verbose:
        print("\nSaving outputs file to ", save_path)
    outputDf.to_pickle(save_path)
    return outputDf
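
ImageFolder receives utils.check_empty_file as its is_valid_file callback, so zero-byte images are skipped when the dataset is listed. The helper isn't shown on this page; a minimal sketch, assuming it simply rejects empty files:

import os

def check_empty_file(path):
    # Assumed behavior: accept a file only if it exists and is non-empty.
    return os.path.isfile(path) and os.path.getsize(path) > 0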
Example #4
def bow_matrix(train_text, test_text, max_features, load_path=None, save_path=None):
    # Identity preprocessor/tokenizer: documents are expected to be pre-tokenized.
    vectorizer = CountVectorizer(max_features=max_features,
                                 preprocessor=lambda x: x,
                                 tokenizer=lambda x: x)

    if load_path:
        vectorizer.vocabulary_ = utils.load_pickle(load_path)
        features_train = vectorizer.transform(train_text).toarray()
    else:
        features_train = vectorizer.fit_transform(train_text).toarray()

    vocabulary    = vectorizer.vocabulary_
    feature_names = vectorizer.get_feature_names()

    features_test = vectorizer.transform(test_text).toarray()

    new_train_df = pd.DataFrame(data=features_train, columns=feature_names)
    new_test_df  = pd.DataFrame(data=features_test, columns=feature_names)

    if save_path:
        utils.save_pickle(vocabulary, save_path)

    return new_train_df, new_test_df, vocabulary
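
Because both preprocessor and tokenizer are identity lambdas, bow_matrix expects already-tokenized documents (lists of token lists) rather than raw strings. A hypothetical call with toy data:

train_text = [["the", "cat", "sat"], ["the", "dog", "ran"]]
test_text  = [["a", "cat", "ran"]]

train_df, test_df, vocab = bow_matrix(train_text, test_text, max_features=1000)
print(train_df.shape)   # one row per document, one column per vocabulary token
print(sorted(vocab))    # the vocabulary tokens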
Example #5
    unlabelNoManualIndex.to_csv(unlabelNoManualPath, index=False)

    # If outputs file already exist, skip inference
    print("\nSTEP: Perform inference on remaining unlabeled set.")
    if not fullOutputPath.is_file():
        mutils.dataset_inference_unlabeled(unlabelNoManualPath, dataTransforms['val'], modelPath,
                            fullOutputPath, batch_size=inferBatchSize, seed=seed, verbose=True)
    else:
        print("Output file already exists: {}\nSkipping inference.".format(fullOutputPath))

    print("\nUsing thresholds:\nUpper: {:.4f}\nLower: {:.4f}".format(upperThresh, lowerThresh))

    ## Perform automatic labeling
    print("\nSTEP: Automatic labeling.")
    unlabeledNoManualIndex = pd.read_csv(unlabelNoManualPath)
    pickleData             = utils.load_pickle(fullOutputPath)

    outputs, imgHashes, _  = dutils.load_outputs_df(fullOutputPath)
    outputs = outputs[:, 0]

    print("\nAutomatic labeling with upper positive ratio {:.1f}%:".format(upperThreshPercent*100))
    autoIndex = dutils.automatic_labeling(outputs, imgHashes, unlabeledNoManualIndex, upperThresh,
                                                     lowerThresh, rede, target_class=target_class)
    autoIndex.to_csv(autoLabelIndexPath, index=False)

    plot_outputs_histogram(outputs, lower_thresh=lowerThresh, upper_thresh=upperThresh,
                        title="Unlabeled Outputs Histogram", save_path=unlabelHistogramPath,
                        log=True, show=False)

    ## Merge labeled sets
    print("\nMerge auto and manual labeled sets.")
Example #6
    / "history_{}_no_finetune_{}_epochs_rede_{}_iteration_{}.pickle".format(datasetName, epochs, rede, iteration)

resultsFolder        = Path(dirs.results) / historyPath.stem
nameEnd  = "history_{}_epochs_rede_{}_iteration_{}.pdf".format(epochs, rede, iteration)
lossName = "loss_"     + nameEnd
accName  = "accuracy_" + nameEnd
f1Name   = "f1_"       + nameEnd

if not historyPath.is_file():
    print("History file does not exist.\nFile:\n", historyPath)
    print("\nExiting program.")
    exit()

dirs.create_folder(resultsFolder)

history = utils.load_pickle(historyPath)

print(history.keys())
valLoss     = history['loss-val']
trainLoss   = history['loss-train']
trainAcc    = history['acc-train']
valAcc      = history['acc-val']
trainF1     = np.array(history['f1-train'])[:, 0]
valF1       = np.array(history['f1-val'])[:, 0]

plot_model_history([trainLoss, valLoss], data_labels=["Train Loss", "Val Loss"], xlabel="Epochs",
                     ylabel="Loss", title="Training loss history", save_path=resultsFolder / lossName,
                     show=False)

plot_model_history([trainAcc, valAcc], data_labels=["Train Acc", "Val Acc"], xlabel="Epochs",
                     ylabel="Acc", title="Training accuracy history", save_path=resultsFolder / accName,
                     show=False)
Example #7
                    fileLen = entryDf.shape[0]

                    entryDf['Class'] = entryDf['FramePath'].apply(get_class)
                    entryDf['Rede'] = [rede]*fileLen
                    entryDf['Validation'] = [val_type]*fileLen
                    entryDf['Dataset'] = [net_type]*fileLen
                    entryDf['Set'] = entryDf['FramePath'].apply(get_set)

                    print(entryDf.groupby('Class').count())
                    if allDatasets is None:
                        allDatasets = entryDf
                    else:
                        allDatasets = pd.concat([allDatasets, entryDf], ignore_index=True)
                    utils.save_pickle(allDatasets, dfPath)
    else:
        allDatasets = utils.load_pickle(dfPath)

    print(allDatasets.groupby('Rede').count())
    print()
    print(allDatasets.groupby('Dataset').count())
    print(allDatasets.groupby('Set').count())
    targetNet = 'reference'
    tablePath = Path(dirs.results) / 'dataset_counts_sets_{}.xlsx'.format(targetNet)

    index = allDatasets['Dataset'] == targetNet
    view = allDatasets.loc[index, :]
    
    index = allDatasets['Rede'] == 1
    view = allDatasets.loc[index, :]
    
    index = allDatasets['Validation'] == 'ref'
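
A side note on the accumulation loop above: calling pd.concat once per entry grows allDatasets quadratically. A common alternative (a sketch, not the original code) collects the per-entry frames in a list and concatenates once:

import pandas as pd

frames = []
for entryDf in entry_dataframes:  # hypothetical iterable of the per-entry DataFrames
    frames.append(entryDf)
allDatasets = pd.concat(frames, ignore_index=True) if frames else None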
Example #8
indexPath    = Path(dirs.iter_folder) / \
                "full_dataset_softmax/iteration_{}/unlabeled_images_iteration_{}.csv".format(iteration-1, iteration-1)
savedModelsFolder = Path(
    dirs.saved_models) / "full_dataset_rede_{}_softmax/iteration_{}".format(
        rede, iteration)
outputPath   = savedModelsFolder / \
                "outputs_full_dataset_{}_epochs_rede_{}_iteration_{}.pickle".format(epochs, rede, iteration)
newIndexPath = Path(dirs.iter_folder) / \
                "full_dataset/iteration_{}/automatic_labeled_images_iteration_{}.csv".format(iteration, iteration)

idealUpperThresh = 0.8923  # Ratio 99%
idealLowerThresh = 0.0904  # Ratio 1%

indexDf = pd.read_csv(indexPath)
pickleData = utils.load_pickle(outputPath)

indexDf = dutils.remove_duplicates(indexDf, "FrameHash")
outputs, imgHashes, _ = dutils.load_outputs_df(outputPath)

outputs = outputs[:, 0]

indexDf.set_index("FrameHash", drop=False, inplace=True)

print("\nAutomatic labeling with upper positive ratio 99%:")
posHashes, negHashes = dutils.automatic_labeling(outputs, imgHashes,
                                                 idealUpperThresh,
                                                 idealLowerThresh)

newLabeledIndex = dutils.get_classified_index(indexDf,
                                              posHashes,
                                              negHashes)  # final argument assumed; the snippet is cut off here
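
Here automatic_labeling returns the hashes of confidently positive and confidently negative images. A minimal sketch of that decision rule, assuming the outputs are positive-class scores (this is not the project's actual implementation):

import numpy as np

def split_by_thresholds(outputs, hashes, upper_thresh, lower_thresh):
    # Confident positives above the upper threshold, confident negatives
    # below the lower one; everything in between stays unlabeled.
    outputs = np.asarray(outputs)
    hashes  = np.asarray(hashes)
    pos_hashes = hashes[outputs >= upper_thresh]
    neg_hashes = hashes[outputs <= lower_thresh]
    return pos_hashes, neg_hashes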