class SampleSpecies(mongoengine.Document):
    """Abstract base document for a sample belonging to a species.

    The document is declared abstract (see ``meta``), so concrete sample
    documents are expected to subclass it. A ``smarter_id`` is generated
    automatically by :meth:`save` when the instance does not have one yet.
    """

    original_id = mongoengine.StringField(required=True)
    smarter_id = mongoengine.StringField(required=True, unique=True)

    country = mongoengine.StringField(required=True)
    species = mongoengine.StringField(required=True)
    breed = mongoengine.StringField(required=True)
    breed_code = mongoengine.StringField(min_length=3)

    # required to search a sample relying only on original ID
    dataset = mongoengine.ReferenceField(
        Dataset,
        db_field="dataset_id",
        reverse_delete_rule=mongoengine.DENY)

    # track the original chip_name with sample
    chip_name = mongoengine.StringField()

    # define enum types for sex
    sex = mongoengine.EnumField(SEX)

    # GPS location
    # NOTE: X, Y where X is longitude, Y latitude
    location = mongoengine.PointField()

    # additional (not modelled) metadata
    metadata = mongoengine.DictField(default=None)

    # for phenotypes
    phenotype = mongoengine.EmbeddedDocumentField(Phenotype, default=None)

    meta = {
        'abstract': True,
    }

    def save(self, *args, **kwargs):
        """Save the document, generating ``smarter_id`` first when missing.

        All positional and keyword arguments are forwarded unchanged to the
        parent ``save`` implementation.

        Returns:
            Whatever the parent ``save`` returns, so that the usual
            ``obj = SampleSpecies(...).save()`` pattern keeps working.
        """

        if not self.smarter_id:
            logger.debug(f"Determining smarter id for {self.original_id}")

            # get the pymongo connection object
            conn = mongoengine.connection.get_db(alias=DB_ALIAS)

            # even if species, country and breed are required fields for
            # SampleSpecies document, their value will not be evaluated until
            # super().save() is called. I can't call it before determining
            # a smarter_id
            self.smarter_id = getSmarterId(
                self.species,
                self.country,
                self.breed,
                conn)

        # default save method; propagate its result instead of dropping it
        return super(SampleSpecies, self).save(*args, **kwargs)

    def __str__(self):
        """Return ``"<smarter_id> (<breed>)"``."""
        return f"{self.smarter_id} ({self.breed})"
class Board(me.Document):
    """A named board holding a list of embedded cards.

    Stores the canvas size, the background configuration, the embedded
    ``Card`` documents, a reference to the owning ``User`` and a flag
    telling whether the board is a root board.
    """

    # unique, validated by the module-level ``_not_empty`` callable
    name = me.StringField(required=True, unique=True, validation=_not_empty)

    canvas_width = me.IntField(default=1000)
    canvas_height = me.IntField(default=1000)

    background_type = me.EnumField(BackgroundType, default=BackgroundType.EMPTY)
    # The default is fractional (0.5), so the field has to be a float field:
    # an integer field would truncate the default to 0 once it is converted.
    # Integer values are still accepted by a float field.
    background_linespace = me.FloatField(default=0.5)

    cards = me.EmbeddedDocumentListField(Card)
    owner = me.ReferenceField('User')
    is_root = me.BooleanField(required=True)

    meta = {'auto_create_index': True}
class User(me.Document):
    """User account document with contact details and gender."""

    # timestamps, both defaulting to the (naive) UTC time at creation
    created_at = me.DateTimeField(default=datetime.utcnow)
    updated_at = me.DateTimeField(default=datetime.utcnow)

    # credentials / identity
    username = me.StringField(required=True)
    email = me.EmailField(unique=True, required=True)
    password = me.StringField()

    # contact details
    address = me.StringField(required=True)
    phone_number = me.StringField(required=True)
    gender = me.EnumField(GenderEnum, required=True)

    def to_json(self):
        """Return the public fields of the user as a plain ``dict``.

        Despite the name, the result is a dictionary and not a serialized
        string. The primary key is exposed as a string under ``"_id"`` and
        the username under ``"name"``; the password is left out.
        """
        copied_as_is = ("email", "address", "phone_number", "gender")

        payload = {"_id": str(self.pk), "name": self.username}
        payload.update(
            (attribute, getattr(self, attribute))
            for attribute in copied_as_is
        )
        return payload
class User(me.Document):
    """User account identified by a generated UUID string.

    Besides the fields, the class offers two lookup helpers that return a
    single document or ``None`` when nothing matches.
    """

    # primary key: a fresh uuid4 rendered as string for every new document
    id = me.StringField(default=lambda: str(uuid4()), primary_key=True)

    # NOTE(review): the pattern has no end anchor, so only the beginning of
    # the name is constrained. Confirm whether the whole name was meant to
    # be restricted before tightening it (existing names could be rejected).
    user_name = me.StringField(regex='^([A-Za-z0-9]+_*)+', required=True)
    password = me.StringField(required=True)
    group = me.EnumField(UserGroupEnum, required=True)

    created_at = me.DateTimeField(default=datetime.utcnow)
    updated_at = me.DateTimeField(default=datetime.utcnow)

    @classmethod
    def get_by_id(cls, id: str):
        """Return the user whose primary key equals ``id``, or ``None``.

        ``first()`` is used instead of testing the queryset and indexing it
        afterwards: that needed two queries and could raise ``IndexError``
        if the document disappeared between them.
        """
        return cls.objects(id=id).first()

    @classmethod
    def get_by_user_name(cls, user_name: str):
        """Return the first user named ``user_name``, or ``None``."""
        return cls.objects(user_name=user_name).first()
class MongoDescriptor(me.Document):
    """Photo descriptor stored together with its location and dataset.

    The descriptor array is persisted as pickled bytes in
    ``binary_descriptor`` and exposed through the ``descriptor`` property.
    The location lives in the ``coords`` point field and is exposed through
    the ``coordinates`` property.
    """

    photo_id = me.LongField(primary_key=True, required=True)
    coords = me.PointField(db_field="coordinates", required=True)
    dataset = me.EnumField(DatasetEnum, required=True)
    binary_descriptor = me.BinaryField(required=True, db_field="descriptor")

    meta = {'collection': 'flickr.descriptors_512'}

    @property
    def descriptor(self):
        """Return the unpickled descriptor, or ``None`` when not set."""
        if self.binary_descriptor is None:
            return None
        # NOTE(review): pickle.loads can execute arbitrary code. This is only
        # acceptable as long as the collection is written by trusted code.
        return pickle.loads(self.binary_descriptor)

    @descriptor.setter
    def descriptor(self, descriptor_array: np.ndarray):
        """Pickle ``descriptor_array`` into ``binary_descriptor``.

        Raises:
            ValueError: if ``descriptor_array`` is not a ``np.ndarray``.
        """
        if not isinstance(descriptor_array, np.ndarray):
            raise ValueError("Descriptor array should be of type np.ndarray")
        self.binary_descriptor = pickle.dumps(descriptor_array, protocol=2)

    @property
    def coordinates(self):
        """Return the point as ``[lng, lat]``.

        ``coords`` may hold a mapping with a ``'coordinates'`` key
        (presumably the form obtained when the document is loaded from the
        database) or the plain ``[lng, lat]`` list assigned by the setters
        of this class. Both forms are supported, so the getter also works
        right after a setter was used.
        """
        point = self.coords
        if isinstance(point, dict):
            return point['coordinates']
        return point

    @coordinates.setter
    def coordinates(self, coord_dict):
        """Set the point from a mapping with ``'lat'`` and ``'lng'`` keys.

        Raises:
            ValueError: if either key is missing.
        """
        if 'lat' not in coord_dict or 'lng' not in coord_dict:
            raise ValueError(
                "error setting coordinates, lat or lng is not in dict")
        self.coords = [coord_dict['lng'], coord_dict['lat']]

    def set_coordinates(self, lng, lat):
        """Set the point from separate longitude and latitude values."""
        self.coords = [lng, lat]

    @classmethod
    def _dataset_queryset(cls, dataset):
        """Return the queryset of ``dataset``, validating its type first.

        Raises:
            ValueError: if ``dataset`` is not a ``DatasetEnum`` member.
        """
        if not isinstance(dataset, DatasetEnum):
            raise ValueError(
                f"dataset should be one of DatasetEnum, was {dataset}")
        return cls.objects(dataset=dataset)

    @classmethod
    def get_ids_and_coords(cls, dataset: DatasetEnum) -> tuple:
        """Load the ids and coordinates of every document in ``dataset``.

        Returns:
            A ``(ids, coordinates)`` tuple: ``ids`` is a 1-D integer array
            and ``coordinates`` an array of shape ``(total, 2)``.

        Raises:
            ValueError: if ``dataset`` is not a ``DatasetEnum`` member.
        """
        objects = cls._dataset_queryset(dataset)

        total = objects.count()
        log.debug(
            f"Getting {dataset.value} dataset from db. Total number of documents in db {total}"
        )

        # photo_id is a LongField: ask for 64 bits explicitly instead of
        # relying on the platform-dependent width of ``int``
        ids = np.zeros(shape=total, dtype=np.int64)
        coordinates = np.zeros(shape=(total, 2))

        batch_size = 100000
        cursor = objects.batch_size(batch_size).only('photo_id', 'coords')
        for i, descriptor in enumerate(cursor):
            ids[i] = descriptor.photo_id
            coordinates[i, :] = descriptor.coordinates
            if (i + 1) % batch_size == 0:
                log.debug(f"Processed {i + 1} documents")

        return ids, coordinates

    @classmethod
    def get_data_as_arrays(cls, dataset: DatasetEnum = None) -> tuple:
        """Load ids, coordinates and descriptors of ``dataset``.

        Returns:
            An ``(ids, coordinates, descriptors)`` tuple, where
            ``descriptors`` has shape ``(total, dim)``.

        Raises:
            ValueError: if ``dataset`` is not a ``DatasetEnum`` member
                (which includes the default ``None``).
        """
        objects = cls._dataset_queryset(dataset)

        total = objects.count()

        # Take the descriptor size from the requested dataset. Fall back to
        # any document of the collection when the dataset is empty, and to 0
        # when the whole collection is empty instead of failing on ``None``.
        sample = objects.first()
        if sample is None:
            sample = cls.objects.first()
        dim = 0 if sample is None else sample.descriptor.shape[0]

        log.debug(
            f"Getting {dataset.value} dataset from db. Total number of documents in db {total}"
        )

        descriptors = np.zeros(shape=(total, dim))
        ids = np.zeros(shape=total, dtype=np.int64)
        coordinates = np.zeros(shape=(total, 2))

        batch_size = 50000
        for i, descriptor in enumerate(objects.batch_size(batch_size)):
            descriptors[i, :] = descriptor.descriptor
            ids[i] = descriptor.photo_id
            coordinates[i, :] = descriptor.coordinates
            if (i + 1) % batch_size == 0:
                log.debug(f"Processed {i + 1} documents")

        return ids, coordinates, descriptors

    @classmethod
    def get_random_docs(cls, dataset, sample_size):
        """Return a queryset with ``sample_size`` random docs of ``dataset``.

        The ids are drawn with an aggregation pipeline and the documents
        are then fetched by primary key.
        """
        # Sample is quite slow, don't suggest using it
        pipeline = [
            {"$sample": {"size": sample_size}},
            {"$project": {'_id': 1}},
        ]
        pipeline_cursor = cls.objects(dataset=dataset).aggregate(pipeline)
        ids = [doc['_id'] for doc in pipeline_cursor]
        return cls.objects(photo_id__in=ids)