def run(self):
        self._is_running = True

        # TODO: sanitize class list - currently, erroneous (possibly dangerous) class names will reach the SQL request untouched
        if not path.exists('./instances/'):
            makedirs('./instances/')

        workdir = './instances/{}/'.format(self.instanceName)
        if not path.exists(workdir):
            makedirs(workdir)

        dataset_creator = DatasetCreator(
                self.db.cursor(),
                workdir=workdir,
                class_config=self.trainingargs['class_config'])
        dataset_dataframe = dataset_creator.get_dataset_dataframe()
        self.trainingargs['class_config'] = dataset_dataframe['label'].unique()

        checkpointarg = self.trainingargs['checkpoint_name']
        checkpointarg = './instances/{}/out/'.format(checkpointarg) if checkpointarg else None
        if checkpointarg and not path.exists(checkpointarg): checkpointarg = None

        instance = TrainingInstance(
                dataset_dataframe=dataset_dataframe,
                checkpoint_path=checkpointarg)
        
        # start training
        model, history = instance.train()
        self.insertModel(history, self.instanceName)
        TrainingThread.save_model(model, output_directory=workdir)

        logging.info('Training Complete for model {}'.format(self.instanceName))

        self._is_running = False
        exit()
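The TODO at the top of run() notes that class names reach the SQL request unchecked. A minimal sketch of a whitelist-style check that could run before the DatasetCreator call; the helper name and the allowed pattern are assumptions, not part of the original code:

import re

def sanitize_class_config(class_config):
    # Hypothetical helper: accept only simple identifiers (letters, digits, _ and -)
    # and reject anything else before it can be interpolated into SQL.
    allowed = re.compile(r'^[A-Za-z0-9_-]+$')
    bad = [name for name in class_config if not allowed.match(name)]
    if bad:
        raise ValueError('Invalid class names: {}'.format(bad))
    return class_config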
class TestDatasetCreator(unittest.TestCase):
    def setUp(self):
        self.imageSource = Mock()
        self.datasetSplitter = Mock()
        self.target = DatasetCreator(self.imageSource, self.datasetSplitter)

    def test_ReadAndSplit(self):
        images = Mock()
        dataset = Mock()
        self.imageSource.load.return_value = images
        self.datasetSplitter.split.return_value = dataset

        result = self.target.buildDataset(datasetSplitIn=[0.6, 0.2, 0.2])

        self.imageSource.load.assert_called_with()
        self.datasetSplitter.split.assert_called_with(images, [0.6, 0.2, 0.2])
        self.assertEqual(dataset, result)

    def test_callsPreprocessorIfInformed(self):
        images = Mock()
        dataset = Mock()
        processedDataset = Mock()
        preprocessor = Mock()
        self.imageSource.load.return_value = images
        self.datasetSplitter.split.return_value = dataset
        preprocessor.process.return_value = processedDataset

        self.target = DatasetCreator(self.imageSource, self.datasetSplitter, preprocessor)

        result = self.target.buildDataset(datasetSplitIn=[0.6, 0.2, 0.2])

        self.imageSource.load.assert_called_with()
        self.datasetSplitter.split.assert_called_with(images, [0.6, 0.2, 0.2])
        preprocessor.process.assert_called_with(dataset)
        self.assertEqual(processedDataset, result)
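Because these tests only exercise mocks, they can be run with the standard unittest runner; a minimal entry point, assuming the usual unittest/Mock/DatasetCreator imports at the top of the test module:

if __name__ == '__main__':
    unittest.main()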
Example #3
def CreateDatasetFromRosbag(rosbagName,
                            pickleName,
                            isBebop=True,
                            start_frame=None,
                            end_frame=None):
    """Converts rosbag to format suitable for training/testing. 
	if start_frame, end_frame are unknown, FrameSelector will help you choose how to trim the video

    Parameters
    ----------
    rosbagName : str
        The file location of the rosbag
    pickleName : str
        name of the new .pickle file
    isBebop : bool, optional
        True if you want RGB dataset for the bebop, False if you want Himax-tailored dataset
    start_frame : int, optional
        if known, the timestamp in ns of the frame you wish to start from 
    end_frame : int, optional
        if known, the timestamp in ns of the frame you wish to finish at
    """

    dc = DatasetCreator(rosbagName)
    if (start_frame is None) or (end_frame is None):
        start_frame, end_frame = dc.FrameSelector()

    if isBebop:
        dc.CreateBebopDataset(0, pickleName, start_frame, end_frame)
    else:
        dc.CreateHimaxDataset(config.himax_delay, pickleName, start_frame,
                              end_frame)
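A usage sketch matching the docstring above; the paths and timestamps are illustrative, not taken from the original project:

# Trim interactively (start/end frames unknown), Bebop RGB output.
CreateDatasetFromRosbag('./data/session1.bag', './data/session1.pickle', isBebop=True)

# Frames already known (timestamps in ns), Himax-tailored output.
CreateDatasetFromRosbag('./data/session2.bag', './data/session2.pickle',
                        isBebop=False,
                        start_frame=1586000000000000000,
                        end_frame=1586000042000000000)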
def CreatePickle(subject_name, rosbagfolder, delay=0):

    rosbagName = rosbagfolder + subject_name + ".bag"
    pickleName = config.folder_path + "/../Pickles/16_4_2020/" + subject_name + ".pickle"
    dc = DatasetCreator(rosbagName)

    dc.CreateHimaxDataset(delay, pickleName, "/image_raw", "optitrack/gapuino",
                          ["optitrack/head"])
def ProcessAllFilesInFolder(rosbag_folder, pickle_folder):
    files = os.listdir(rosbag_folder)

    for f in files:

        #print(f)
        dc = DatasetCreator(rosbag_folder + f)
        pickleName = pickle_folder + os.path.splitext(
            os.path.basename(f))[0] + ".pickle"
        #print(pickleName)
        dc.CreateHimaxDataset(0, pickleName, "/image_raw", "optitrack/gapuino",
                              ["optitrack/head"])
Example #6
def main():
    # subject_name = "davide1"
    # rosbagName = config.folder_path + "/data/Hand/" + subject_name + ".bag"
    # pickleName = config.folder_path + "/data/Hand/" + subject_name + "Hand.pickle"
    # CreateDatasetFromRosbag(rosbagName, pickleName, isBebop=True, start_frame=None, end_frame=None)

    subject_name = "session3"
    rosbagName = config.folder_path + "/data/compressed/" + subject_name + ".bag"
    pickleName = config.folder_path + "/data/compressed/" + subject_name + ".pickle"
    #CreateDatasetFromDarioRosbag(rosbagName, pickleName, start_frame=None, end_frame=None)

    dc = DatasetCreator(rosbagName)
    #dc.CreateBebopDataset(0, pickleName, "bebop/image_raw/compressed", "optitrack/drone", ["optitrack/head", "optitrack/hand"])
    dc.CreateHimaxDataset(config.himax_delay, pickleName, "himax_camera",
                          "bebop/image_raw/compressed", "optitrack/drone",
                          ["optitrack/head", "optitrack/hand"])
def training_and_classification_with_kfold_cross_validation(collection_name, k):
    '''
    Training and classification of an autotagger using k-fold cross validation
    '''
    _split_metadata_and_features(collection_name, k)
    for i in range(1,k+1):
        # Create a gaia dataset with the training set
        print "----------------------- DATASET CREATION (FOLD %d)-----------------------" % i
        training_features='train/%s_features__fold%d.tsv' % (collection_name, i)
        chunk_size=5000
        dataset_suffix="fold%d" % i
        replace_dataset=True
        dataset_creator = DatasetCreator(collection_name)
        dataset_creator.create(training_features, chunk_size, dataset_suffix, replace_dataset)
            
        # Feature selection over the gaia dataset
        print "----------------------- FEATURE SELECTION (FOLD %d)-----------------------" % i
        dataset='dbs/%s__fold%d.db' % (collection_name, i)
        pca_covered_variance=75
        include_highlevel=True
        feature_selector = FeatureSelector()
        feature_selector.select(dataset, pca_covered_variance, include_highlevel)
        
        # Autotag a given test set
        print "----------------------- AUTOTAGGING (FOLD %d)-----------------------" % i
        dataset='transformed_dbs/%s__fold%d.db' % (collection_name, i)
        training_metadata='train/%s_metadata__fold%d.tsv' % (collection_name, i)
        test_features='test/%s_features__fold%d.tsv' % (collection_name, i)
        output_binary='test/%s_output_binary__fold%d.tsv' % (collection_name, i)
        output_affinity='test/%s_output_affinity__fold%d.tsv' % (collection_name, i)
        metric='LC'
        num_sim=18
        threshold=0.2
        autotagger = Autotagger()
        autotagger.train(dataset, training_metadata)
        autotagger.classify(test_features, output_binary, metric, num_sim, threshold, ranked=False)
        autotagger.classify(test_features, output_affinity, metric, num_sim, threshold, ranked=True)
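A minimal usage sketch; the collection name is illustrative, and the train/, test/, dbs/ and transformed_dbs/ folders are assumed to be laid out as in the snippet above:

# Illustrative 5-fold cross-validation run for a hypothetical collection.
training_and_classification_with_kfold_cross_validation('jamendo', 5)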
Example #8
def CreateDatasetFromDarioRosbag(rosbagName,
                                 pickleName,
                                 start_frame=None,
                                 end_frame=None):
    """Converts Dario's rosbag to format suitable for training/testing. 
	if start_frame, end_frame are unknown, FrameSelector will help you choose how to trim the video

    Parameters
    ----------
    rosbagName : str
        The file location of the rosbag
    pickleName : str
        name of the new .pickle file
    start_frame : int, optional
        if known, the timestamp in ns of the frame you wish to start from 
    end_frame : int, optional
        if known, the timestamp in ns of the frame you wish to finish at
    """

    dc = DatasetCreator(rosbagName)
    if (start_frame is None) or (end_frame is None):
        start_frame, end_frame = dc.FrameSelector(True)

    dc.CreateBebopDarioDataset(0, pickleName, start_frame, end_frame)
Example #9
def TestDatasetCreator():
    subject_name = "davide1"
    dc = DatasetCreator(config.folder_path + "/data/Hand/" + subject_name +
                        ".bag")
    start_frame, end_frame = dc.FrameSelector()
    dc.CreateBebopDataset(
        0, config.folder_path + "/data/Hand/" + subject_name + "Hand.pickle",
        start_frame, end_frame)

    subject_name = "davide2"
    dc2 = DatasetCreator(config.folder_path + "/data/Hand/" + subject_name +
                         ".bag")
    start_frame, end_frame = dc2.FrameSelector()
    dc2.CreateBebopDataset(
        0, config.folder_path + "/data/Hand/" + subject_name + "Hand.pickle",
        start_frame, end_frame)

    folderPath = config.folder_path + "/data/Hand/"
    fileList = ["davide1Hand.pickle", "davide2Hand.pickle"]
    DatasetCreator.JoinPickleFiles(
        fileList, config.folder_path + "/data/Hand/DavideHand.pickle",
        folderPath)
    calibOutputDir = join(
        calibOutputBaseDir,
        'NFilt_{}_weights_{}_DDW_{}_{}_regFactor_{}'.format(
            filtsize, lossWeights, DDx_new, loss_function, regFactor))
    mkdir(calibOutputDir)
    calibOutputPath = join(calibOutputDir, 'outputFilters.rawImage')

    if use_tfrecords:
        trainFilePaths = recordhandler.ConvertDatabaseToTFRecords(
            trainPath, join(trainPath, 'tfrecords'), maxExamples=maxNExamples)
        validFilePaths = recordhandler.ConvertDatabaseToTFRecords(
            validPath, join(validPath, 'tfrecords'), maxExamples=maxNExamples)
    else:
        # get train database
        myCreator = DatasetCreator(trainPath,
                                   NCube=NCube,
                                   NDD=NDD,
                                   maxNExamples=maxNExamples)
        myCreator.cropDDWidth(DDx_new)
        train_database = myCreator.getDataset()

        # get validation database
        myCreator = DatasetCreator(validPath,
                                   NCube=NCube,
                                   NDD=NDD,
                                   maxNExamples=maxNExamples)
        myCreator.cropDDWidth(DDx_new)
        valid_database = myCreator.getDataset()

        assert (train_database['Cubes'].shape[0] % batchSize == 0)
        assert (valid_database['Cubes'].shape[0] % batchSize == 0)
Example #11
    def CreateWithSampleGroupingSplitter(imageSource, preprocessor = None, GetSampleNumberFunction = None):
        if GetSampleNumberFunction is None:
            GetSampleNumberFunction = GetSampleNumberFromFilename()
        return DatasetCreator(imageSource = imageSource,
                              datasetSplitter = SampleGroupingDatasetSplitter(GetSampleNumberFunction),
                              preprocessor = preprocessor)
Example #12
    def CreateWithFileGroupingSplitter(imageSource, numFilePerImage, preprocessor = None):
        return DatasetCreator(imageSource = imageSource,
                              datasetSplitter = FileGroupingDatasetSplitter(numFilePerImage),
                              preprocessor = preprocessor)
Example #13
    def CreateWithPredicateSplitter(imageSource, imgNumbersInValid, imgNumbersInTest, preprocessor = None):
        return DatasetCreator(imageSource = imageSource,
                              datasetSplitter = PredicateDatasetSplitter(
                                  shouldBeInValid = FileNumberRegexMatcher(imgNumbersInValid),
                                  shouldBeInTest = FileNumberRegexMatcher(imgNumbersInTest)),
                              preprocessor = preprocessor)
Example #14
    def CreateWithSplitter(imageSource, datasetSplitter, preprocessor = None):
        return DatasetCreator(imageSource = imageSource,
                              datasetSplitter = datasetSplitter,
                              preprocessor = preprocessor)
Example #15
def JoinPickles(fileList, picklename):

    picklefolder = config.folder_path + "/../Pickles/16_4_2020/"
    #fileList = {"Clip1.pickle", "Clip2.pickle", "Clip3.pickle", "Clip4.pickle", "Clip5.pickle", "Clip6.pickle"}
    DatasetCreator.JoinPickleFiles(fileList, picklefolder + picklename,
                                   picklefolder)
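A usage sketch following the commented-out file list above; the clip names are illustrative:

# Merge several per-clip pickles from the 16_4_2020 folder into a single pickle.
JoinPickles(["Clip1.pickle", "Clip2.pickle", "Clip3.pickle"], "AllClips.pickle")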
Example #16
def calibEstimatorSanityTest2_createData():
    logfiledir = _LOG_FILE_DIR
    validDir = join(logfiledir, 'Valid')
    trainDir = join(logfiledir, 'Train')

    # define sizes for tests:
    sysPaths = SystemSettings.getSystemPaths('Server')
    sysDims = SystemSettings.getSystemDimensions()
    NCube = sysDims.NCube  # Cube [y, x, lambda] image size
    NDD = list(sysDims.NDD)  # DD [y,x] image size
    NFilt = sysDims.NFilt  # number of coefficients to be estimated for each lambda filter
    DDx_new = sysDims.DDx_new  # the amount of Data influenced by a filter of size 300
    NChannels = NCube[2]

    numTrainExamples = 1000
    numValidExamples = 200

    NCube_train = (numTrainExamples * NCube[0], 1, NCube[1], NCube[2])
    NCube_valid = (numValidExamples * NCube[0], 1, NCube[1], NCube[2])

    # Cube_train = np.random.standard_normal(NCube_train).astype(dtype=np.float32)
    # Cube_valid = np.random.standard_normal(NCube_valid).astype(dtype=np.float32)

    dataCreator = DatasetCreator(directory=sysPaths.trainPath,
                                 NCube=NCube,
                                 NDD=NDD)
    dataCreator.cropDDWidth(DDx_crop=DDx_new)
    train_dataset = dataCreator.getDataset()
    Cube_train = train_dataset['Cubes']
    # Cube_std = np.std(Cube_train)
    #Cube_train = Cube_train / Cube_std
    del train_dataset

    # print('calibEstimatorSanityTest2_createData: Cube: std: {}, mean: {}, min: {}, max: {}'.format(
    #     np.std(Cube_train), np.mean(Cube_train), np.min(Cube_train), np.max(Cube_train)
    # ))

    dataCreator = DatasetCreator(directory=sysPaths.validPath,
                                 NCube=NCube,
                                 NDD=NDD)
    dataCreator.cropDDWidth(DDx_crop=DDx_new)
    valid_dataset = dataCreator.getDataset()
    Cube_valid = valid_dataset['Cubes']
    # Cube_valid = Cube_valid / Cube_std
    del valid_dataset

    Filts_GT = np.squeeze(imhand.readImage(_FILTERS_GT_PATH))
    # crop Filts_GT to the shape of NFilt
    crop_remove_size = int((Filts_GT.shape[1] - NFilt) / 2)
    Filts_GT = Filts_GT[1:32, crop_remove_size:crop_remove_size + NFilt]
    # Filts_GT = np.random.normal(loc=0.0, scale=1.0, size=(31, 301)).astype(dtype=np.float32)

    print('calibEstimatorSanityTest2_createData: Filters size: ({}x{})'.format(
        Filts_GT.shape[0], Filts_GT.shape[1]))

    NDD[1] = DDx_new  # directly use DDx_new instead of the original size which is too big

    DD_train = np.zeros((NCube_train[0], 1, NDD[1], 1), np.float32)
    DD_valid = np.zeros((NCube_valid[0], 1, NDD[1], 1), np.float32)

    # create the DD (Y) image:
    cEst = CalibEstimator(NX=NCube,
                          NY=NDD,
                          L=NChannels,
                          NFilt=NFilt,
                          learningRate=0.01,
                          batchSize=128,
                          a0=Filts_GT)
    cEst.setModeEval()
    cEst.createNPArrayDatasets()
    cEst.buildModel()

    DD_train = cEst.eval(Xeval=Cube_train, Yeval=DD_train)
    DD_valid = cEst.eval(Xeval=Cube_valid, Yeval=DD_valid)

    cEst.resetModel()

    # save results:
    # filters:
    filters_str = join(logfiledir, 'filters_GT.rawImage')
    imhand.writeImage(Filts_GT, filters_str)

    # save training data:
    for ii in range(numTrainExamples):
        cube_str = join(trainDir, 'Img_{}_Cube.rawImage'.format(ii))
        DD_str = join(trainDir, 'Img_{}_DD.rawImage'.format(ii))
        imhand.writeImage(
            np.squeeze(Cube_train[ii * 256:(ii + 1) * 256, :, :, :]), cube_str)
        imhand.writeImage(np.squeeze(DD_train[ii * 256:(ii + 1) * 256, :]),
                          DD_str)

    # save validation data:
    for ii in range(numValidExamples):
        cube_str = join(validDir, 'Img_{}_Cube.rawImage'.format(ii))
        DD_str = join(validDir, 'Img_{}_DD.rawImage'.format(ii))
        imhand.writeImage(
            np.squeeze(Cube_valid[ii * 256:(ii + 1) * 256, :, :, :]), cube_str)
        imhand.writeImage(np.squeeze(DD_valid[ii * 256:(ii + 1) * 256, :]),
                          DD_str)
Example #17
def calibEstimatorSanityTest2(subfold=None):
    logfiledir = _LOG_FILE_DIR
    validDir = join(logfiledir, 'Valid')
    trainDir = join(logfiledir, 'Train')

    # define sizes for tests:
    sysDims = SystemSettings.getSystemDimensions()
    NCube = sysDims.NCube  # Cube [y, x, lambda] image size
    NDD = list(sysDims.NDD)  # DD [y,x] image size
    NFilt = sysDims.NFilt  # number of coefficients to be estimated for each lambda filter
    DDx_new = sysDims.DDx_new  # the amount of Data influenced by a filter of size 300
    NChannels = NCube[2]
    NDD[1] = DDx_new  # directly use DDx_new instead of the original size which is too big

    # get train database
    myCreator = DatasetCreator(trainDir, NCube=NCube, NDD=NDD, maxNExamples=-1)
    train_database = myCreator.getDataset()

    # get validation database
    myCreator = DatasetCreator(validDir, NCube=NCube, NDD=NDD, maxNExamples=-1)
    valid_database = myCreator.getDataset()

    Filts_GT = imhand.readImage(join(logfiledir, 'filters_GT.rawImage'))

    train_dict = {
        'Xtrain': train_database['Cubes'],
        'Ytrain': train_database['DDs'],
        'Xvalid': valid_database['Cubes'],
        'Yvalid': valid_database['DDs']
    }

    Cube_train = train_dict['Xtrain']
    print(
        'calibEstimatorSanityTest2: Cube: std: {}, mean: {}, min: {}, max: {}'
        .format(np.std(Cube_train), np.mean(Cube_train), np.min(Cube_train),
                np.max(Cube_train)))

    if subfold is None:
        outFold = logfiledir
    else:
        outFold = join(logfiledir, subfold)
        mkdir(outFold)

    # run a training network and check the output weights
    # estimate calibration:
    cEst = CalibEstimator(NX=NCube,
                          NY=NDD,
                          L=NChannels,
                          NFilt=NFilt,
                          learningRate=0.01,
                          batchSize=100,
                          numEpochs=10,
                          logfiledir=outFold,
                          optimizer='gd')
    cEst.createNPArrayDatasets()
    cEst.buildModel()
    cEst.train(DBtype='NPArray', DBargs=train_dict)
    Filts_Calib = cEst.getCalibratedWeights()
    imhand.writeImage(Filts_Calib, join(outFold, 'Filters_Calib.rawImage'))

    diff = np.squeeze(Filts_Calib) - np.squeeze(Filts_GT)
    maxAbsDiff = np.max(np.abs(diff))
    error = np.sum(np.square(diff)) / diff.size
    print('error norm: {}, max abs error: {}'.format(error, maxAbsDiff))
    cEst.resetModel()
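calibEstimatorSanityTest2 reads the filters_GT.rawImage file and the Train/Valid folders written by calibEstimatorSanityTest2_createData, so a minimal driver runs them in that order; the subfolder name is illustrative:

# Generate the synthetic cube/DD stripes and ground-truth filters first,
# then fit the calibration network against them and compare to the ground truth.
calibEstimatorSanityTest2_createData()
calibEstimatorSanityTest2(subfold='sanity_run')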
# You should have received a copy of the GNU General Public License
# along with music-autotagging-msordo.  If not, see <http://www.gnu.org/licenses/>.

# Written by Mohamed Sordo (@neomoha)
# Email: mohamed ^dot^ sordo ^at^ gmail ^dot^ com
# Website: http://msordo.weebly.com

import os, sys, argparse

from DatasetCreator import DatasetCreator

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Create a Gaia Dataset given a list of feature files')
    parser.add_argument('collection_name', help='Name of the collection')
    parser.add_argument('--training-features', default=None, help='A file containing paths to the features of the audios used for training (default="train/COLLECTIONNAME_features.txt")')
    parser.add_argument('--chunk-size', type=int, default=5000, help='The dataset will be created in chunks of N songs at a time (default=5000)')
    parser.add_argument('--dataset-suffix', default=None, help='suffix to add to the dataset filename (useful when doing k-fold cross validation, for example) (default=None)')
    parser.add_argument('-r', '--replace-dataset', help='Replace old dataset (if it exists)', action="store_true")
    args = parser.parse_args()
    
    if args.training_features is None:
        args.training_features = "train/"+args.collection_name+"_features.tsv"
    
    if not os.path.exists(args.training_features):
        print "Taining features file '%s' not found" % args.training_features
        sys.exit(-1)
    
    print args
    dataset_creator = DatasetCreator(args.collection_name)
    dataset_creator.create(args.training_features, args.chunk_size, args.dataset_suffix, args.replace_dataset)
Example #20
    def Create(imageSource, preprocessor = None):
        return DatasetCreator(imageSource = imageSource,
                              datasetSplitter = ClassBalancingDatasetSplitter(),
                              preprocessor = preprocessor)
def ConvertDatabaseToTFRecords(inFolder, outFolder, maxExamples=-1):
    print('ConvertDatabaseToTFRecords:')
    print('input folder: ' + inFolder)
    print('output folder: ' + outFolder)
    print('maxExamples: ' + str(maxExamples))

    # get the cube and dd file lists:
    sysdims = getSystemDimensions()

    if not isdir(outFolder):
        mkdir(outFolder)

    dataCreator = DatasetCreator(directory=inFolder,
                                 NCube=sysdims.NCube,
                                 NDD=sysdims.NDD,
                                 maxNExamples=maxExamples)
    CubeFiles, DDFiles, Filenames = dataCreator.getFileLists()

    # crop dd image indices:
    x_dd_start = int((sysdims.NDD[1] - sysdims.DDx_new) / 2)
    x_dd_end = x_dd_start + sysdims.DDx_new

    # initialize output
    outFiles = []

    outFilePath = join(outFolder,
                       'database_DDW{}.tfrecords'.format(sysdims.DDx_new))
    if isfile(outFilePath):
        # output file already exists: return it without rewriting
        return [outFilePath]

    writer = tf.python_io.TFRecordWriter(outFilePath)

    # iterate over all paths:
    for cubepath, ddpath, filename in zip(CubeFiles, DDFiles, Filenames):

        # read images:
        cubeim = imhand.readImage(cubepath)
        ddim = imhand.readImage(ddpath)[:, x_dd_start:x_dd_end, :]
        cubeheight = cubeim.shape[0]
        cubewidth = cubeim.shape[1]
        cubechannels = cubeim.shape[2]
        ddheight = ddim.shape[0]
        ddwidth = ddim.shape[1]

        for ii in range(cubeheight):
            # convert image stripes to string:
            cube_raw = cubeim[ii, :, :].tostring()
            dd_raw = ddim[ii, :, :].tostring()

            # create a feature:
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'cubewidth': _int64_feature(cubewidth),
                    'cubechannels': _int64_feature(cubechannels),
                    'ddwidth': _int64_feature(ddwidth),
                    'Cube': _bytes_feature(cube_raw),
                    'DD': _bytes_feature(dd_raw)
                }))

            # write feature to file:
            writer.write(example.SerializeToString())

    # close file:
    writer.close()

    return [outFilePath]
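For reference, a sketch of how records written in this format could be read back with the same TF 1.x API family; the float32 dtype of the raw Cube/DD stripes is an assumption about what imhand.readImage returns, and reshaping the flat vectors via cubewidth/cubechannels/ddwidth is left to the caller:

def ParseStripeExample(serialized_example):
    # Mirror the feature dict used by the writer above.
    features = tf.parse_single_example(
        serialized_example,
        features={
            'cubewidth': tf.FixedLenFeature([], tf.int64),
            'cubechannels': tf.FixedLenFeature([], tf.int64),
            'ddwidth': tf.FixedLenFeature([], tf.int64),
            'Cube': tf.FixedLenFeature([], tf.string),
            'DD': tf.FixedLenFeature([], tf.string)
        })
    cube = tf.decode_raw(features['Cube'], tf.float32)  # dtype assumed
    dd = tf.decode_raw(features['DD'], tf.float32)      # dtype assumed
    return cube, dd, features['cubewidth'], features['cubechannels'], features['ddwidth']

# Illustrative use with the tf.data API:
# dataset = tf.data.TFRecordDataset(outFilePath).map(ParseStripeExample)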
    parser.add_argument(
        '--chunk-size',
        type=int,
        default=5000,
        help=
        'The dataset will be created in chunks of N songs at a time (default=5000)'
    )
    parser.add_argument(
        '--dataset-suffix',
        default=None,
        help=
        'suffix to add to the dataset filename (useful when doing k-fold cross validation, for example) (default=None)'
    )
    parser.add_argument('-r',
                        '--replace-dataset',
                        help='Replace old dataset (if it exists)',
                        action="store_true")
    args = parser.parse_args()

    if args.training_features is None:
        args.training_features = "train/" + args.collection_name + "_features.tsv"

    if not os.path.exists(args.training_features):
        print "Taining features file '%s' not found" % args.training_features
        sys.exit(-1)

    print args
    dataset_creator = DatasetCreator(args.collection_name)
    dataset_creator.create(args.training_features, args.chunk_size,
                           args.dataset_suffix, args.replace_dataset)