class GroupDocument(esd.DocType):
    """Elasticsearch document holding capture metadata and file location fields."""

    # Capture metadata.
    date = esd.Date()
    aperture = esd.Float()
    exposure = esd.Float()
    focal_length = esd.Float()
    focal_length_35 = esd.Float()
    iso = esd.Integer()

    # Equipment names are declared 'not_analyzed'.  A lowercase keyword
    # analyzer was tried here at some point and left disabled:
    #   analyzer=esd.analyzer('keyword', tokenizer="keyword", filter=['lowercase'])
    model = esd.String(index="not_analyzed")
    lens = esd.String(index="not_analyzed")

    # File location, also 'not_analyzed'.
    path = esd.String(index="not_analyzed")
    dirname = esd.String(index="not_analyzed")
    basename = esd.String(index="not_analyzed")
def document_field(field):
    """
    The default ``field_factory`` method for converting Django field
    instances to ``elasticsearch_dsl.Field`` instances.

    Auto-created fields (primary keys, for example) and one-to-many fields
    (reverse FK relationships) are skipped.

    :param field: a Django model field instance.
    :returns: ``None`` for skipped fields, ``RawMultiString`` for
        many-to-many fields, the mapped ``dsl`` field for the exact field
        classes listed below, and ``RawString`` for everything else.
    """
    if field.auto_created or field.one_to_many:
        return None
    if field.many_to_many:
        return RawMultiString

    defaults = {
        models.DateField: dsl.Date(),
        models.DateTimeField: dsl.Date(),
        models.IntegerField: dsl.Long(),
        models.PositiveIntegerField: dsl.Long(),
        models.BooleanField: dsl.Boolean(),
        # Slugs must be matched verbatim.  The legacy spelling was
        # String(index='not_analyzed'); 'not_analyzed' is not a valid
        # ``index`` value for a text field, so the exact-match equivalent
        # is the keyword type.
        models.SlugField: dsl.Keyword(),
        models.DecimalField: dsl.Double(),
        models.FloatField: dsl.Float(),
    }

    # NullBooleanField was removed in Django 4.0; only map it when the
    # installed Django still provides it, instead of raising AttributeError.
    null_boolean_field = getattr(models, 'NullBooleanField', None)
    if null_boolean_field is not None:
        defaults[null_boolean_field] = dsl.Boolean()

    # The lookup is by exact class: subclasses of the listed field types that
    # are not listed themselves fall back to RawString.
    return defaults.get(field.__class__, RawString)
def doc_field(type):
    """Return the ``dsl`` field for a type name, or ``RawString`` if unknown.

    :param type: type name; the recognised values are ``'date'``,
        ``'integer'``, ``'boolean'``, ``'double'`` and ``'float'``.
    :returns: a new ``dsl`` field instance for a recognised name, otherwise
        ``RawString``.
    """
    by_name = dict(
        date=dsl.Date(),
        integer=dsl.Long(),
        boolean=dsl.Boolean(),
        double=dsl.Double(),
        float=dsl.Float(),
    )
    if type in by_name:
        return by_name[type]
    return RawString
class PhotoDocument(esd.DocType):
    """Elasticsearch document holding capture metadata, size and file location fields."""

    # Capture metadata.
    date = esd.Date()
    aperture = esd.Float()
    exposure = esd.Float()
    focal_length = esd.Float()
    focal_length_35 = esd.Float()
    iso = esd.Integer()
    size = esd.Integer()

    # Each equipment name is declared twice: once 'not_analyzed' and once as a
    # ``*_ci`` field run through a keyword tokenizer plus lowercase filter.
    model = esd.String(index="not_analyzed")
    model_ci = esd.String(
        analyzer=esd.analyzer("keyword", tokenizer="keyword", filter=["lowercase"])
    )
    lens = esd.String(index="not_analyzed")
    lens_ci = esd.String(
        analyzer=esd.analyzer("keyword", tokenizer="keyword", filter=["lowercase"])
    )

    # File location.
    path = esd.String(index="not_analyzed")
    dirname = esd.String(index="not_analyzed")
    basename = esd.String(index="not_analyzed")

    def extended_dict(self):
        """Return ``to_dict()`` with the document's meta id added under ``"id"``."""
        return dict(self.to_dict(), id=self.meta.id)
class TopicDocument(es.Document):
    """Document linking a topic (id and weight) to a source document and its metadata."""

    topic_id = es.Keyword()
    topic_weight = es.Float()
    document_es_id = es.Keyword()
    datetime = es.Date()
    document_source = es.Keyword()
    document_corpus = es.Keyword()
    document_num_views = es.Integer()
    document_num_comments = es.Integer()

    class Index:
        # Name template noted by the original author:
        #   f"{ES_INDEX_TOPIC_DOCUMENT}_{tm}"
        name = ES_INDEX_TOPIC_DOCUMENT
        using = ES_CLIENT

        settings = dict(
            number_of_shards=3,
            number_of_replicas=1,
            max_result_window=5000000,
        )
        # NOTE(review): `settings_dynamic` and `mappings` are presumably read by
        # project code that creates indices by hand - confirm against callers.
        settings_dynamic = dict(
            number_of_shards=2,
            number_of_replicas=1,
            max_result_window=5000000,
        )
        mappings = {
            "properties": {
                "datetime": {"type": "date"},
                "document_es_id": {"type": "keyword"},
                "document_source": {"type": "keyword"},
                "document_corpus": {"type": "keyword"},
                "document_num_views": {"type": "long"},
                "document_num_comments": {"type": "long"},
                "topic_id": {"type": "keyword"},
                "topic_weight": {"type": "float"},
            }
        }
def decorator(cls):
    """Attach an elasticsearch_dsl field to ``cls`` for every ``cls.schema`` entry.

    Each schema entry's ``"type"`` selects the field class (see the table
    below).  Two extra keys are recognised on top of the regular schema
    syntax:

    * ``"elastic"``: dict of keyword arguments passed unchanged to the field
      constructor.
    * ``"elastictype"``: described by the original author as a more specific
      elasticsearch_dsl type; this function only removes it.

    Both extra keys are removed from the schema entry once its field is set.

    :param cls: class exposing a ``schema`` dict; it is modified in place.
    :returns: the same ``cls``.
    :raises Exception: if an entry's ``"type"`` is not in the table.
    """
    print("setup_schema:" + cls.__name__.lower())

    from datetime import datetime  # NOTE(review): not referenced in this function
    import elasticsearch_dsl

    # (schema type, elasticsearch_dsl attribute name).  Kept as pairs compared
    # with ``==`` and resolved lazily, so non-hashable type values and dsl
    # attributes that are never needed behave exactly like an if/elif chain.
    type_table = (
        ("integer", "Integer"),
        ("float", "Float"),
        ("string", "Text"),
        ("bool", "Boolean"),
        ("date", "Date"),
        ("datetime", "Date"),
        ("number", "Integer"),
        ("binary", "Byte"),
        ("list", "Keyword"),
    )

    for elem in cls.schema.keys():
        rules = cls.schema[elem]
        # Raw constructor kwargs for the field.
        elastic = rules.get("elastic", {})
        declared = rules["type"]

        field_name = next(
            (dsl_name for schema_type, dsl_name in type_table
             if declared == schema_type),
            None,
        )
        if field_name is None:
            raise Exception("Wrong Datatype in schema")

        setattr(cls, elem, getattr(elasticsearch_dsl, field_name)(**elastic))

        # Strip the elastic-only keys so the schema is plain again.
        rules.pop("elastic", None)
        rules.pop("elastictype", None)

    return cls
class META_DTM(es.Document):
    """Document with a name, day-based volume/delta values, a reset flag and a date range."""

    meta_name = es.Keyword()
    volume_days = es.Float()
    delta_days = es.Float()
    reset_index = es.Boolean()
    from_date = es.Date()
    to_date = es.Date()

    class Index:
        name = ES_INDEX_META_DTM
        using = ES_CLIENT

        settings = dict(number_of_shards=1, number_of_replicas=1)
        # Explicit mapping that mirrors the field declarations above.
        mappings = {
            "properties": {
                "meta_name": {"type": "keyword"},
                "volume_days": {"type": "float"},
                "delta_days": {"type": "float"},
                "reset_index": {"type": "boolean"},
                "from_date": {"type": "date"},
                "to_date": {"type": "date"},
            },
        }
class sigpac_record(dsl.DocType):
    """Document type ``sigpac`` stored in the ``plots`` index."""

    # Identification codes.
    dn_pk = dsl.Long()
    provincia = dsl.Integer()
    municipio = dsl.Integer()
    poligono = dsl.Integer()
    parcela = dsl.Integer()
    recinto = dsl.Integer()
    zona = dsl.Integer()

    # Measurements.
    perimetro = dsl.Long()
    superficie = dsl.Long()
    pend_med = dsl.Integer()

    # Geometry.
    points = dsl.GeoShape()
    bbox = dsl.GeoShape()
    bbox_center = dsl.GeoPoint(lat_lon=True)

    # Remaining attributes.
    uso_sigpac = dsl.String()
    agregado = dsl.Integer()
    cap_auto = dsl.Integer()
    cap_manual = dsl.Integer()
    coef_regadio = dsl.Float()
    c_refpar = dsl.String()
    c_refpol = dsl.String()
    c_refrec = dsl.String()
    dn_oid = dsl.Long()
    elevation = dsl.Float()

    def save(self, **kwargs):
        """Save the document by delegating unchanged to ``DocType.save``."""
        parent_save = super(sigpac_record, self).save
        return parent_save(**kwargs)

    class Meta:
        index = "plots"
        doc_type = "sigpac"
class Text(es.InnerDoc):
    """Inner-document mapping for the text data this plugin returns."""

    # Text variants.
    full_text = es.Text()
    # "pattern_*" scores.
    pattern_polarity = es.Float()
    pattern_subjectivity = es.Float()
    short_text = es.Text()
    translated = es.Text()
    truncated = es.Boolean()
    tweet_length = es.Integer()
    # "vader_*" scores.
    vader_compound = es.Float()
    vader_compound_inverted = es.Float()
    vader_negative = es.Float()
    vader_neutral = es.Float()
    vader_positive = es.Float()
class DocumentLocation(es.Document):
    """Document linking a source document to a weighted location."""

    document_es_id = es.Keyword()
    document_datetime = es.Date()
    document_source = es.Keyword()
    location_name = es.Keyword()
    location_level = es.Keyword()
    location_weight = es.Float()
    location_id = es.Keyword()

    class Index:
        # Original author's marker next to the name:
        #   !!! f"{ES_INDEX_DOCUMENT_EVAL}_{tm}_{criterion.id}"
        name = ES_INDEX_DOCUMENT_LOCATION
        using = ES_CLIENT

        settings = dict(
            number_of_shards=3,
            number_of_replicas=1,
            max_result_window=5000000,
        )
        # Explicit mapping that mirrors the field declarations above.
        mappings = {
            "properties": {
                "document_datetime": {"type": "date"},
                "document_es_id": {"type": "keyword"},
                "document_source": {"type": "keyword"},
                "location_level": {"type": "keyword"},
                "location_name": {"type": "keyword"},
                "location_weight": {"type": "float"},
                "location_id": {"type": "keyword"},
            }
        }
class DocumentEval(es.Document):
    """Document holding a float value for a source document plus top/bottom topic ids."""

    value = es.Float()
    document_es_id = es.Keyword()
    document_datetime = es.Date()
    document_source = es.Keyword()
    topic_ids_top = es.Keyword()
    topic_ids_bottom = es.Keyword()

    class Index:
        # Original author's marker next to the name:
        #   !!! f"{ES_INDEX_DOCUMENT_EVAL}_{tm}_{criterion.id}{_neg}{_m4a}{_m4a_class}"
        name = ES_INDEX_DOCUMENT_EVAL
        using = ES_CLIENT

        settings = dict(
            number_of_shards=3,
            number_of_replicas=1,
            max_result_window=5000000,
        )
        # Explicit mapping that mirrors the field declarations above.
        mappings = {
            "properties": {
                "document_datetime": {"type": "date"},
                "document_es_id": {"type": "keyword"},
                "document_source": {"type": "keyword"},
                "value": {"type": "float"},
                "topic_ids_top": {"type": "keyword"},
                "topic_ids_bottom": {"type": "keyword"},
            }
        }
class TopicModellingIndex(es.Document):
    """Document describing one topic model: data scope, status, parameters, scores and topics."""

    # Data scope.
    corpus = es.Keyword()
    source = es.Keyword()
    number_of_documents = es.Integer()

    # Status flags.
    is_ready = es.Boolean()
    has_topic_info = es.Boolean()

    # Identification and timing.
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()
    datetime_from = es.Date()
    datetime_to = es.Date()

    # Model configuration.
    algorithm = es.Keyword()
    number_of_topics = es.Integer()
    hierarchical = es.Boolean()
    meta_parameters = es.Object()

    # Quality scores.
    perplexity = es.Float()
    purity = es.Float()
    contrast = es.Float()
    coherence = es.Float()

    # "tau_*" coefficients.
    tau_smooth_sparse_theta = es.Float()
    tau_smooth_sparse_phi = es.Float()
    tau_decorrelator_phi = es.Float()
    tau_coherence_phi = es.Float()

    # Nested ``Topic`` inner documents.
    topics = es.Nested(Topic)

    is_actualizable = es.Boolean()

    class Index:
        name = ES_INDEX_TOPIC_MODELLING
        using = ES_CLIENT
class InfoRiegoRecord(dsl.DocType):
    """Document stored in the ``inforiego`` index: one dated set of station readings."""

    # Station identification and timestamp.
    code = dsl.String()
    location = dsl.String()
    date = dsl.Date()

    # Readings.
    rain = dsl.Float()
    temperature = dsl.Float()
    rel_humidity = dsl.Float()
    radiation = dsl.Float()
    wind_speed = dsl.Float()
    wind_direction = dsl.Float()

    # Station position.
    lat_lon = dsl.GeoPoint(lat_lon=True)
    station_height = dsl.Integer()

    def save(self, **kwargs):
        """Save the document by delegating unchanged to ``DocType.save``."""
        parent_save = super(InfoRiegoRecord, self).save
        return parent_save(**kwargs)

    class Meta:
        index = "inforiego"
class ValidationJob(elasticsearch_dsl.Document):
    """
    Document describing one validation job: bookkeeping, metrics and the
    parameters it ran with.

    TODO: Can this be merged with TrainingJob, with a common parent object?
    """

    # --- Bookkeeping ---
    id = elasticsearch_dsl.Integer()
    schema_version = elasticsearch_dsl.Integer()
    job_name = elasticsearch_dsl.Keyword()
    author = elasticsearch_dsl.Keyword()
    created_at = elasticsearch_dsl.Date()
    params = elasticsearch_dsl.Text()
    raw_log = elasticsearch_dsl.Text()

    # --- Metrics ---
    purported_acc = elasticsearch_dsl.Float()
    purported_loss = elasticsearch_dsl.Float()
    purported_sensitivity = elasticsearch_dsl.Float()

    avg_test_acc = elasticsearch_dsl.Float()
    avg_test_loss = elasticsearch_dsl.Float()
    avg_test_sensitivity = elasticsearch_dsl.Float()
    avg_test_specificity = elasticsearch_dsl.Float()
    avg_test_true_pos = elasticsearch_dsl.Float()
    avg_test_false_neg = elasticsearch_dsl.Float()
    avg_test_auc = elasticsearch_dsl.Float()

    best_test_acc = elasticsearch_dsl.Float()
    best_test_loss = elasticsearch_dsl.Float()
    best_test_sensitivity = elasticsearch_dsl.Float()
    best_test_specificity = elasticsearch_dsl.Float()
    best_test_true_pos = elasticsearch_dsl.Float()
    best_test_false_neg = elasticsearch_dsl.Float()
    best_test_auc = elasticsearch_dsl.Float()

    best_end_val_acc = elasticsearch_dsl.Float()
    best_end_val_loss = elasticsearch_dsl.Float()
    best_max_val_acc = elasticsearch_dsl.Float()
    best_max_val_loss = elasticsearch_dsl.Float()

    # --- Params ---
    batch_size = elasticsearch_dsl.Integer()
    val_split = elasticsearch_dsl.Float()
    seed = elasticsearch_dsl.Integer()
    rotation_range = elasticsearch_dsl.Float()
    width_shift_range = elasticsearch_dsl.Float()
    height_shift_range = elasticsearch_dsl.Float()
    shear_range = elasticsearch_dsl.Float()
    zoom_range = elasticsearch_dsl.Keyword()
    horizontal_flip = elasticsearch_dsl.Boolean()
    vertical_flip = elasticsearch_dsl.Boolean()
    dropout_rate1 = elasticsearch_dsl.Float()
    dropout_rate2 = elasticsearch_dsl.Float()
    data_dir = elasticsearch_dsl.Keyword()
    gcs_url = elasticsearch_dsl.Keyword()
    mip_thickness = elasticsearch_dsl.Integer()
    height_offset = elasticsearch_dsl.Integer()
    pixel_value_range = elasticsearch_dsl.Keyword()

    # Explicit list of the parameter field names for the parser; the class
    # attributes cannot be discovered through the usual introspection here.
    params_to_parse = [
        'batch_size',
        'val_split',
        'seed',
        'rotation_range',
        'width_shift_range',
        'height_shift_range',
        'shear_range',
        'zoom_range',
        'horizontal_flip',
        'vertical_flip',
        'dropout_rate1',
        'dropout_rate2',
        'data_dir',
        'gcs_url',
        'mip_thickness',
        'height_offset',
        'pixel_value_range',
    ]

    class Index:
        name = VALIDATION_JOBS
class Topic(es.InnerDoc):
    """Inner document for a single topic: identity, nested words, size and weight."""

    id = es.Keyword()
    name = es.Keyword()
    # Nested ``TopicWord`` inner documents.
    topic_words = es.Nested(TopicWord)
    topic_size = es.Integer()
    topic_weight = es.Float()
machine_ad = edsl.Mapping.from_es( doc_type="machine_ad", index=options.indexname, using=es ) if not "claims" in machine_ad or not "failed" in machine_ad.to_dict()['machine_ad']['properties']['claims']['properties']: machine_ad.field( "jobs", edsl.Object(properties={status: edsl.Text(multi=True) for status in STATUSES}), ) machine_ad.field( "claims", edsl.Object( properties={ status: edsl.Object( properties={resource: edsl.Float() for resource in RESOURCES} ) for status in STATUSES } ), ) machine_ad.field( "occupancy", edsl.Object( properties={ status: edsl.Object( properties={resource: edsl.Float() for resource in RESOURCES} ) for status in STATUSES + ("total",) } ),
class Geocomplete(es.DocType):
    """Document type ``geoloc-entry`` with edge n-gram analysis on name and postal code.

    The class attributes are order-dependent: token filters and the tokenizer
    are declared first, then the analyzers built from them, then the fields
    that use those analyzers.
    """

    class Meta:
        index = 'geocomplete'
        doc_type = 'geoloc-entry'

    # --- Name analysis -----------------------------------------------------
    french_elision = es.token_filter(
        'french_elision',
        type='elision',
        articles_case=True,
        articles=[
            'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
            'jusqu', 'quoiqu', 'lorsqu', 'puisqu',
        ],
    )

    geocompletion_ngram_filter = es.token_filter(
        'geocompletion_ngram',
        type='edgeNGram',
        min_gram=1,
        max_gram=50,
        side='front',
    )

    # Replaces spaces with hyphens.
    town_filter = es.token_filter(
        'town_filter',
        type='pattern_replace',
        pattern=' ',
        replacement='-',
    )

    geocompletion_index_tokenizer = es.tokenizer(
        'geocompletion_index_tokenizer',
        type='pattern',
        pattern='@',
    )

    # Index-time analyzer: includes the edge n-gram filter.
    geocompletion_index_analyzer = es.analyzer(
        'geocompletion_index_analyzer',
        type='custom',
        tokenizer=geocompletion_index_tokenizer,
        filter=[
            'lowercase',
            'asciifolding',
            french_elision,
            town_filter,
            geocompletion_ngram_filter,
        ],
    )

    # Search-time analyzer: same tokenizer, no n-gram filter.
    geocompletion_search_analyzer = es.analyzer(
        'geocompletion_search_analyzer',
        type='custom',
        tokenizer=geocompletion_index_tokenizer,
        filter=['lowercase', 'asciifolding', town_filter, french_elision],
    )

    name = es.String(
        index='analyzed',
        analyzer=geocompletion_index_analyzer,
        search_analyzer=geocompletion_search_analyzer,
        fields={'raw': es.String(index='not_analyzed')},
    )

    complement = es.String(index='not_analyzed')

    # --- Postal code analysis ----------------------------------------------
    postal_code_ngram_filter = es.token_filter(
        'postal_code_ngram',
        type='edgeNGram',
        min_gram=1,
        max_gram=5,
        side='front',
    )

    postal_code_index_analyzer = es.analyzer(
        'postal_code_index_analyzer',
        type='custom',
        tokenizer='standard',
        filter=[postal_code_ngram_filter],
    )

    postal_code_search_analyzer = es.analyzer(
        'postal_code_search_analyzer',
        type='custom',
        tokenizer='standard',
    )

    postal_code = es.String(
        index='analyzed',
        analyzer=postal_code_index_analyzer,
        search_analyzer=postal_code_search_analyzer,
        fields={'raw': es.String(index='not_analyzed')},
    )

    geolocation = es.GeoPoint()
    weight = es.Float()

    def __init__(self, meta=None, **kwargs):
        """Initialise the document, then point ``_doc_type.index`` at the computed name.

        The index is rewritten only when the current index name occurs inside
        the value returned by ``compute_index_name``.
        """
        super(Geocomplete, self).__init__(meta, **kwargs)
        # NOTE(review): if compute_index_name is not idempotent, this check
        # could match and rewrite the index again on later instantiations -
        # confirm against compute_index_name.
        if self.index not in compute_index_name(self.index):
            return
        self._doc_type.index = compute_index_name(self.index)

    @property
    def index(self):
        """Index name currently held by ``_doc_type``."""
        return self._doc_type.index

    @property
    def doc_type(self):
        """Document type name held by ``_doc_type``."""
        return self._doc_type.name
class DictionaryWord(es.Document):
    """Document for one dictionary word: forms, flags, lengths and frequency statistics."""

    # Identity.
    dictionary = es.Keyword()
    word = es.Keyword()
    word_normal = es.Keyword()

    # Flags.
    is_in_pymorphy2_dict = es.Boolean()
    is_multiple_normals_in_pymorphy2 = es.Boolean()
    is_stop_word = es.Boolean()
    is_latin = es.Boolean()
    is_kazakh = es.Boolean()

    # Shape.
    n_gram_len = es.Integer()
    pos_tag = es.Keyword()
    word_len = es.Integer()

    # Absolute frequencies.
    word_frequency = es.Integer()
    word_normal_frequency = es.Integer()
    document_frequency = es.Integer()
    document_normal_frequency = es.Integer()

    # Relative frequencies.
    word_frequency_relative = es.Float()
    word_normal_frequency_relative = es.Float()
    document_frequency_relative = es.Float()
    document_normal_frequency_relative = es.Float()

    # Capitalisation ratios.
    word_first_capital_ratio = es.Float()
    word_normal_first_capital_ratio = es.Float()

    class Index:
        # Name template noted by the original author:
        #   f"{ES_INDEX_DICTIONARY_WORD}_{name}{_temp}"
        name = ES_INDEX_DICTIONARY_WORD
        using = ES_CLIENT

        settings = dict(number_of_shards=3, number_of_replicas=1)
        # Explicit mapping that mirrors the field declarations above.
        mappings = {
            "properties": {
                "dictionary": {"type": "keyword"},
                "word": {"type": "keyword"},
                "word_normal": {"type": "keyword"},
                "is_in_pymorphy2_dict": {"type": "boolean"},
                "is_multiple_normals_in_pymorphy2": {"type": "boolean"},
                "is_stop_word": {"type": "boolean"},
                "is_latin": {"type": "boolean"},
                "is_kazakh": {"type": "boolean"},
                "n_gram_len": {"type": "integer"},
                "pos_tag": {"type": "keyword"},
                "word_len": {"type": "integer"},
                "word_frequency": {"type": "integer"},
                "word_normal_frequency": {"type": "integer"},
                "document_frequency": {"type": "integer"},
                "document_normal_frequency": {"type": "integer"},
                "word_frequency_relative": {"type": "float"},
                "word_normal_frequency_relative": {"type": "float"},
                "document_frequency_relative": {"type": "float"},
                "document_normal_frequency_relative": {"type": "float"},
                "word_first_capital_ratio": {"type": "float"},
                "word_normal_first_capital_ratio": {"type": "float"},
            },
        }
class TopicWord(es.InnerDoc):
    """Inner document pairing a word with a float weight."""

    word = es.Keyword()
    weight = es.Float()
class TrainingJob(elasticsearch_dsl.Document):
    """Document describing one training job: bookkeeping, metrics and the
    parameters it ran with."""

    # --- Bookkeeping ---
    id = elasticsearch_dsl.Integer()
    schema_version = elasticsearch_dsl.Integer()
    job_name = elasticsearch_dsl.Keyword()
    author = elasticsearch_dsl.Keyword()
    created_at = elasticsearch_dsl.Date()
    ended_at = elasticsearch_dsl.Date()
    params = elasticsearch_dsl.Text()
    raw_log = elasticsearch_dsl.Text()
    model_url = elasticsearch_dsl.Text()

    # --- Metrics ---
    epochs = elasticsearch_dsl.Integer()
    train_acc = elasticsearch_dsl.Float()
    final_val_acc = elasticsearch_dsl.Float()
    best_val_acc = elasticsearch_dsl.Float()
    final_val_loss = elasticsearch_dsl.Float()
    best_val_loss = elasticsearch_dsl.Float()
    final_val_sensitivity = elasticsearch_dsl.Float()
    best_val_sensitivity = elasticsearch_dsl.Float()
    final_val_specificity = elasticsearch_dsl.Float()
    best_val_specificity = elasticsearch_dsl.Float()
    final_val_auc = elasticsearch_dsl.Float()
    best_val_auc = elasticsearch_dsl.Float()

    # --- Params ---
    batch_size = elasticsearch_dsl.Integer()
    val_split = elasticsearch_dsl.Float()
    seed = elasticsearch_dsl.Integer()
    rotation_range = elasticsearch_dsl.Float()
    width_shift_range = elasticsearch_dsl.Float()
    height_shift_range = elasticsearch_dsl.Float()
    shear_range = elasticsearch_dsl.Float()
    zoom_range = elasticsearch_dsl.Keyword()
    horizontal_flip = elasticsearch_dsl.Boolean()
    vertical_flip = elasticsearch_dsl.Boolean()
    dropout_rate1 = elasticsearch_dsl.Float()
    dropout_rate2 = elasticsearch_dsl.Float()
    data_dir = elasticsearch_dsl.Keyword()
    gcs_url = elasticsearch_dsl.Keyword()
    mip_thickness = elasticsearch_dsl.Integer()
    height_offset = elasticsearch_dsl.Integer()
    pixel_value_range = elasticsearch_dsl.Keyword()

    # Explicit list of the parameter field names for the parser; the class
    # attributes cannot be discovered through the usual introspection here.
    params_to_parse = [
        'batch_size',
        'val_split',
        'seed',
        'rotation_range',
        'width_shift_range',
        'height_shift_range',
        'shear_range',
        'zoom_range',
        'horizontal_flip',
        'vertical_flip',
        'dropout_rate1',
        'dropout_rate2',
        'data_dir',
        'gcs_url',
        'mip_thickness',
        'height_offset',
        'pixel_value_range',
    ]

    class Index:
        name = TRAINING_JOBS