import torch
# kMeanCluster comes from the CPC clustering package (in CPC_audio this is
# cpc.criterion.clustering); adjust the import to your project layout.

def loadClusterModule(pathCheckpoint, norm_vec_len=False):
    """Load a CPC clustering module from a checkpoint and move it to GPU."""
    print(f"Loading ClusterModule at {pathCheckpoint}")
    state_dict = torch.load(pathCheckpoint)
    if "state_dict" in state_dict:  # k-means: weights stored as a state dict
        clusterModule = kMeanCluster(
            torch.zeros(1, state_dict["n_clusters"], state_dict["dim"]),
            norm_vec_len)
        clusterModule.load_state_dict(state_dict["state_dict"])
    else:  # DP-means: centroids stored directly under "mu"
        clusterModule = kMeanCluster(state_dict["mu"])
    clusterModule = clusterModule.cuda()
    return clusterModule
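
# Minimal usage sketch (hypothetical path; assumes, as in CPC_audio, that
# kMeanCluster's forward returns the distance to each centroid, so discrete
# units come from an argmin):
clusterModule = loadClusterModule("checkpoint.pt")  # hypothetical checkpoint path
features = torch.randn(8, 128, 256).cuda()          # (batch, seq, dim); dim must match the checkpoint
distances = clusterModule(features)                 # assumed shape: (batch, seq, n_clusters)
units = torch.argmin(distances, dim=-1)             # discrete cluster indices, shape (batch, seq)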
Example no. 2
def loadClusterModule(pathCheckpoint):
    """
    Load a CPC clustering module from a clustering checkpoint file.
    The checkpoint is mapped to CPU, so no GPU is required.
    """
    state_dict = torch.load(pathCheckpoint, map_location=torch.device('cpu'))
    clusterModule = kMeanCluster(
        torch.zeros(1, state_dict["n_clusters"], state_dict["dim"]))
    clusterModule.load_state_dict(state_dict["state_dict"])
    return clusterModule
Example no. 3
    pathConfig = f"{os.path.splitext(args.pathOutput)[0]}_args.json"
    with open(pathConfig, 'w') as file:
        json.dump(vars(args), file, indent=2)

    out_state_dict = {}
    print("Starting the clustering...")
    start_time = time.time()
    clusters = kMeanGPU(trainLoader,
                        featureMaker,
                        args.nClusters,
                        args.nGroups,
                        perIterSize=args.perIterSize,
                        MAX_ITER=args.MAX_ITER,
                        save=args.save,
                        load=args.load,
                        save_dir=os.path.dirname(args.pathOutput),
                        save_last=args.save_last,
                        norm_vec_len=args.norm_vec_len).cpu()

    print(f'Ran clustering in {time.time() - start_time:.2f} seconds')

    clusterModule = kMeanCluster(clusters, norm_vec_len=args.norm_vec_len)

    out_state_dict["state_dict"] = clusterModule.state_dict()
    out_state_dict["encoder_layer"] = args.encoder_layer
    out_state_dict["n_clusters"] = args.nClusters
    out_state_dict['dim'] = clusters.size(2)
    torch.save(out_state_dict, args.pathOutput)
    with open(pathConfig, 'w') as file:
        json.dump(vars(args), file, indent=2)
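
# The saved checkpoint can be inspected directly; a small sketch ("Ck" is an
# assumption about kMeanCluster's internal centroid buffer, as in CPC_audio):
ckpt = torch.load("clusters.pt", map_location="cpu")  # hypothetical output path
print(ckpt["n_clusters"], ckpt["dim"])                # metadata saved above
centroids = ckpt["state_dict"]["Ck"]                  # assumed centroid buffer name
print(centroids.shape)                                # (1, n_clusters, dim)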
Example no. 4
def main(pathActivations, pathOutput, nGroups=1, nClusters=50, MAX_ITER=100,
         batchSizeGPU=50, debug=False, extension='.pt', getDistanceEstimation=False,
         load=False, perIterSize=-1, recursionLevel=2, save=False, save_last=5,
         seqList=None):
    # Check that the activation file extension is valid
    if extension not in ['.txt', '.npy', '.pt']:
        raise ValueError(f'Activation file extension invalid ({extension})')

    torch.cuda.empty_cache()

    # Snapshot the arguments (they are dumped to JSON below)
    args = argparse.Namespace(**locals())
    # Convert to absolute paths for later use
    pathActivations = os.path.abspath(pathActivations)
    pathOutput = os.path.abspath(pathOutput)

    if not load:
        assert not os.path.exists(pathOutput), \
            f"The output file {pathOutput} already exists, please check the option --load !"
        assert not os.path.exists(os.path.join(os.path.dirname(pathOutput), "checkpoint_last.pt")), \
            "Found checkpoint_last.pt in the output directory, please check the option --load !"

    print(args)
    seqNames, speakers = findAllSeqs(pathActivations,
                                     speaker_level=recursionLevel,
                                     extension=extension,
                                     loadCache=True)

    if seqList is not None:
        seqNames = filterSeqs(seqList, seqNames)
    if debug:
        nsamples = 1000
        print(f"Debug mode activated, get only {nsamples} samples!")
        shuffle(seqNames)
        seqNames = seqNames[:nsamples]
    if getDistanceEstimation:
        shuffle(seqNames)
        seqNames = seqNames[:5000]

    print("")
    print(f'Loading activations at {pathActivations}')
    start_time = time.time()
    dataset = SequentialData(pathActivations, seqNames, None)
    print(f"Dataset loaded in {time.time()-start_time} seconds !")
    print("")

    nGPUs = torch.cuda.device_count()
    if nGPUs == 0:
        raise RuntimeError('No GPU found')
    batchSize = batchSizeGPU * nGPUs
    dataloader = dataset.getDataLoader(batchSize, numWorkers=0)
    print(f"Length of dataLoader: {len(dataloader)}")
    print("")

    # Create the output directory if needed
    if os.path.dirname(pathOutput):
        Path(os.path.dirname(pathOutput)).mkdir(parents=True, exist_ok=True)

    pathConfig = f"{os.path.splitext(pathOutput)[0]}_args.json"
    with open(pathConfig, 'w') as file:
        json.dump(vars(args), file, indent=2)

    out_state_dict = {}
    print("Starting the clustering...")
    start_time = time.time()
    # Use an identity lambda to skip feature extraction, since we start
    # from precomputed activations
    clusters = kMeanGPU(dataloader, lambda x: x, nClusters, nGroups,
                        perIterSize=perIterSize,
                        MAX_ITER=MAX_ITER,
                        save=save, load=load,
                        save_dir=os.path.dirname(pathOutput),
                        save_last=save_last,
                        ).cpu()

    print(f'Ran clustering in {time.time() - start_time:.2f} seconds')

    clusterModule = kMeanCluster(clusters)
    out_state_dict["state_dict"] = clusterModule.state_dict()
    out_state_dict["n_clusters"] = nClusters
    out_state_dict['dim'] = clusters.size(2)
    torch.save(out_state_dict, pathOutput)
    with open(pathConfig, 'w') as file:
        json.dump(vars(args), file, indent=2)
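
# Hedged invocation sketch (paths are illustrative; activations are assumed to
# be precomputed .pt files laid out speaker/sequence to match recursionLevel=2;
# note that main raises a RuntimeError if no GPU is available):
main("path/to/activations",   # directory of activation files (illustrative)
     "path/to/clusters.pt",   # output checkpoint (illustrative)
     nClusters=50, MAX_ITER=100, batchSizeGPU=50)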
Example no. 5
def loadClusterModule(pathCheckpoint, norm_vec_len=False):
    """
    Load CPC Clustering Module from Clustering checkpoint file.
    """
    state_dict = torch.load(pathCheckpoint, map_location=torch.device('cpu'))
    clusterModule = kMeanCluster(torch.zeros(1, state_dict["n_clusters"],
                                             state_dict["dim"]),
                                 norm_vec_len=norm_vec_len)
    clusterModule.load_state_dict(state_dict["state_dict"])
    return clusterModule
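
# Unlike Example no. 1, this loader leaves the module on CPU; a minimal sketch
# of guarded device placement before running assignments:
clusterModule = loadClusterModule("checkpoint.pt")  # hypothetical path
if torch.cuda.is_available():                       # move to GPU only when one exists
    clusterModule = clusterModule.cuda()
clusterModule.eval()                                # inference only; kMeanCluster is an nn.Module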


#def loadRobertaCheckpoint(pathBERTCheckpoint, pathData, from_pretrained=False):
#    """
#    Load Roberta model from checkpoint.
#    If loading a pretrained model from fairseq, set from_pretrained=True.
#    """
#    if from_pretrained:  # Requires a network connection to download the BPE files; may fail for trained checkpoints that contain a cfg
#        roberta = RobertaModel.from_pretrained(dirname(pathBERTCheckpoint), basename(pathBERTCheckpoint), pathData)
#    else:
#        # Set up the args Namespace
#        model_args = argparse.Namespace(
#            task='masked_lm',
#            seed=-1,
#            output_dictionary_size=-1,
#            data=pathData,
#            path=pathBERTCheckpoint
#            )
#
#        # Setup task
#        task = tasks.setup_task(model_args)
#
#        # Load model
#        models, _model_args = checkpoint_utils.load_model_ensemble([model_args.path], task=task)
#        model = models[0]
#
#        # Wrap in a RobertaHubInterface (to be consistent with RobertaModel.from_pretrained)
#        roberta = RobertaHubInterface(_model_args, task, model)
#
#    return roberta

#def loadLSTMLMCheckpoint(pathLSTMCheckpoint, pathData):
#    """
#    Load lstm_lm model from checkpoint.
#    """
#    # Set up the args Namespace
#    model_args = argparse.Namespace(
#        task='language_modeling',
#        output_dictionary_size=-1,
#        data=pathData,
#        path=pathLSTMCheckpoint
#        )
#
#    # Setup task
#    task = tasks.setup_task(model_args)
#
#    # Load model
#    models, _model_args = checkpoint_utils.load_model_ensemble([model_args.path], task=task)
#    model = models[0]
#
#    return model, task