def _getaveragedocsize(model, customclient=None):
    """Estimate the in-RAM byte size of one sample document.

    Queries MongoDB 'collstats' for the model's sample collection and
    returns twice the reported average object size (the 2x factor leaves
    headroom over the on-disk size). Falls back to a flat 100000 bytes
    when the server reports no 'avgObjSize' (e.g. empty collection).

    :param model: model whose sampletype names the sample collection
        ('<SampleType>s' in the 'IWLearn' database).
    :param customclient: optional externally-owned mongo client (e.g. a
        test mock); when given, it is used as-is and NOT closed here.
    :return: estimated document size in bytes (float or int).
    """
    owns_client = customclient is None
    if owns_client:
        client = mongoclient()
    else:
        client = customclient
    try:
        collection_name = model.sampletype.__name__ + 's'
        stats = client['IWLearn'].command('collstats', collection_name)
        if 'avgObjSize' in stats:
            return stats['avgObjSize'] * 2.0
        return 100000
    finally:
        # Only close connections we opened ourselves.
        if owns_client:
            client.close()
def get(model_id):
    """Load a pickled model from disk, located via its MongoDB record.

    Looks up the 'Models' collection in the 'IWLearn' database by ObjectId,
    reads the stored 'filepath', and unpickles the model from that file.

    :param model_id: string/ObjectId-compatible id of the model document.
    :return: the unpickled model object.
    :raises ValueError: if no model document exists for model_id.
    """
    # Bug fix: the original created a mongo client and never closed it,
    # leaking a connection per call.
    client = mongoclient()
    try:
        collection = client['IWLearn']['Models']
        doc = collection.find_one(
            filter={'_id': ObjectId(model_id)},
            projection={'_id': 0, 'filepath': 1})
    finally:
        client.close()
    if doc is None:
        # Explicit error instead of the original's opaque TypeError on
        # subscripting None.
        raise ValueError('Model %s not found' % model_id)
    logging.info('load model from %s', doc['filepath'])
    with open(doc['filepath'], 'rb') as f:
        return cPickle.load(f)
def __init__(self):
    # Prediction rule this instance applies.
    self.rule = RelocationRule()
    # Handle to the predictions store: Tutorial database, Predictions
    # collection. NOTE(review): the mongo client opened here is kept for
    # the object's lifetime and is not closed by this class.
    client = mongo.mongoclient()
    database = client['Tutorial']
    self.collection = database['Predictions']
def generate(experiment_name, model, maxRAM=None, numclasses=None,
             part_size=None, customclient=None, **kwargs):
    """Generate a new or extend an existing dataset by loading it from
    MongoDB, and cache it on disk.

    The disk cache is split into parts samplewise, each part containing
    at most part_size samples. Inside each part, one file per feature is
    saved in numpy.save (.npy) format. The separation into parts allows
    both generating datasets larger than RAM and reading from such
    datasets during training. Saving features into separate files allows
    easy extension or removal of features for existing datasets.

    :param experiment_name: name of the subdirectory containing the
        cached dataset.
    :param model: the model this dataset is created for. It defines the
        features to extract, the input shape, and the sample type to
        load.
    :param maxRAM: memory budget used to derive batch_size when the
        caller does not pass batch_size in kwargs; defaults to 50% of
        the physical RAM of this machine.
    :param numclasses: for classifiers, the number of classes (cannot be
        inferred from the model when model.output_shape == (1,)).
    :param part_size: number of samples per on-disk part; usually chosen
        automatically. Parts below ~4 MiB are known to work best.
    :param customclient: optional mongo client to use (e.g. a mock for
        tests); when given, it is not closed here.
    :param kwargs: forwarded to the collection's find() (use 'filter')
        or aggregate() (use 'pipeline') call.
    :return: None; raises/logs on failure.
    """
    if customclient is None:
        client = mongoclient()
    else:
        client = customclient
    try:
        coll = client['IWLearn'][model.sampletype.__name__ + 's']
        # Bug fix: the original only computed batch_size when it was NOT
        # in kwargs, yet referenced it unconditionally below, raising
        # NameError when the caller supplied batch_size.
        if 'batch_size' in kwargs:
            batch_size = kwargs['batch_size']
        else:
            if maxRAM is None:
                # Default budget: half of this machine's physical RAM.
                maxRAM = int(0.5 * os.sysconf('SC_PAGE_SIZE')
                             * os.sysconf('SC_PHYS_PAGES'))
            average_doc_size = DataSet._getaveragedocsize(model, client)
            batch_size = int(maxRAM / average_doc_size)
        if 'filter' in kwargs:
            kwargs.setdefault('batch_size', batch_size)
            cursor = coll.find(**kwargs)
        elif 'pipeline' in kwargs:
            kwargs.setdefault('cursor', {'batchSize': batch_size})
            cursor = coll.aggregate(**kwargs)
        else:
            raise Exception('provide filter or pipeline')
        logging.info('Determined batch_size is %d', batch_size)
        if DataSet._generateImpl(
                experiment_name, model,
                lambda: model.sampletype.fromjson(cursor.next()),
                part_size, numclasses) == 0:
            raise Exception('Cannot generate set: no samples')
    except Exception:
        # Bug fix: e.message is Python-2-only and dropped the traceback;
        # logging.exception records both message and stack.
        logging.exception('dataset generation failed')
    finally:
        # Bug fix: close() was commented out, which both leaked the
        # connection and left the 'if' with an empty body (SyntaxError).
        if customclient is None and client is not None:
            client.close()