class TrainingJob(elasticsearch_dsl.Document):
    """Elasticsearch document holding one training job record.

    The fields are declared in three groups: general job fields, metric
    fields and individual parameter fields. Documents are stored in the
    index named by ``TRAINING_JOBS``.
    """

    id = elasticsearch_dsl.Integer()
    schema_version = elasticsearch_dsl.Integer()
    job_name = elasticsearch_dsl.Keyword()
    author = elasticsearch_dsl.Keyword()
    created_at = elasticsearch_dsl.Date()
    ended_at = elasticsearch_dsl.Date()
    params = elasticsearch_dsl.Text()
    raw_log = elasticsearch_dsl.Text()
    model_url = elasticsearch_dsl.Text()

    # Metrics
    epochs = elasticsearch_dsl.Integer()
    train_acc = elasticsearch_dsl.Float()
    final_val_acc = elasticsearch_dsl.Float()
    best_val_acc = elasticsearch_dsl.Float()
    final_val_loss = elasticsearch_dsl.Float()
    best_val_loss = elasticsearch_dsl.Float()
    final_val_sensitivity = elasticsearch_dsl.Float()
    best_val_sensitivity = elasticsearch_dsl.Float()
    final_val_specificity = elasticsearch_dsl.Float()
    best_val_specificity = elasticsearch_dsl.Float()
    final_val_auc = elasticsearch_dsl.Float()
    best_val_auc = elasticsearch_dsl.Float()

    # Params
    batch_size = elasticsearch_dsl.Integer()
    val_split = elasticsearch_dsl.Float()
    seed = elasticsearch_dsl.Integer()
    rotation_range = elasticsearch_dsl.Float()
    width_shift_range = elasticsearch_dsl.Float()
    height_shift_range = elasticsearch_dsl.Float()
    shear_range = elasticsearch_dsl.Float()
    zoom_range = elasticsearch_dsl.Keyword()
    horizontal_flip = elasticsearch_dsl.Boolean()
    vertical_flip = elasticsearch_dsl.Boolean()
    dropout_rate1 = elasticsearch_dsl.Float()
    dropout_rate2 = elasticsearch_dsl.Float()
    data_dir = elasticsearch_dsl.Keyword()
    gcs_url = elasticsearch_dsl.Keyword()
    mip_thickness = elasticsearch_dsl.Integer()
    height_offset = elasticsearch_dsl.Integer()
    pixel_value_range = elasticsearch_dsl.Keyword()

    # We need to keep a list of params for the parser because
    # we can't use traditional approaches to get the class attrs.
    # The names below mirror the fields of the "Params" group above, so
    # a parameter field added there has to be added here as well.
    params_to_parse = [
        'batch_size', 'val_split', 'seed', 'rotation_range',
        'width_shift_range', 'height_shift_range', 'shear_range',
        'zoom_range', 'horizontal_flip', 'vertical_flip', 'dropout_rate1',
        'dropout_rate2', 'data_dir', 'gcs_url',
        'mip_thickness', 'height_offset', 'pixel_value_range'
    ]

    class Index:
        # Name of the index that stores these documents.
        name = TRAINING_JOBS
def document_field(field):
    """
    Default ``field_factory``: map a Django model field to an
    ``elasticsearch_dsl`` field.

    Returns ``None`` for auto-created fields (primary keys, for example)
    and for one-to-many fields (reverse FK relationships), which are
    skipped. Many-to-many fields map to ``RawMultiString``. Every other
    field is looked up by its exact class; classes without an entry fall
    back to ``RawString``.
    """
    # Skipped fields: nothing is indexed for them.
    if field.auto_created:
        return None
    if field.one_to_many:
        return None

    if field.many_to_many:
        return RawMultiString

    es_field_by_django_class = {
        models.DateField: dsl.Date(),
        models.DateTimeField: dsl.Date(),
        models.IntegerField: dsl.Long(),
        models.PositiveIntegerField: dsl.Long(),
        models.BooleanField: dsl.Boolean(),
        models.NullBooleanField: dsl.Boolean(),
        # NOTE(review): 'not_analyzed' is the legacy ``String`` spelling;
        # confirm that ``Text`` accepts it on the targeted Elasticsearch
        # version (``Keyword`` may be what is intended here).
        models.SlugField: dsl.Text(index='not_analyzed'),
        models.DecimalField: dsl.Double(),
        models.FloatField: dsl.Float(),
    }
    # Exact class match only: subclasses of the listed Django fields that
    # are not listed themselves get the ``RawString`` fallback.
    django_class = field.__class__
    return es_field_by_django_class.get(django_class, RawString)
def doc_field(type):
    """Return the ``dsl`` field registered for the type name ``type``.

    Known names are ``'date'``, ``'integer'``, ``'boolean'``, ``'double'``
    and ``'float'``; any other value yields ``RawString``.
    """
    fields_by_type_name = {
        'date': dsl.Date(),
        'integer': dsl.Long(),
        'boolean': dsl.Boolean(),
        'double': dsl.Double(),
        'float': dsl.Float(),
    }
    requested = type
    return fields_by_type_name.get(requested, RawString)
class GameSummary(elasticsearch_dsl.Document):
    """Game search model.

    Documents are stored in the ``games`` index.
    """

    id = elasticsearch_dsl.Text()
    name = elasticsearch_dsl.Text()
    isPublic = elasticsearch_dsl.Boolean()
    # Object field whose inner document class is ``PlayersInGame``.
    players = elasticsearch_dsl.Object(PlayersInGame)

    class Index:  # pylint: disable=missing-class-docstring
        name = "games"
class EntityDocument(CollectionDocument):
    """Document for entity search.

    Adds two fields on top of ``CollectionDocument`` and targets the
    ``entity`` index.
    """

    descriptor_completed = dsl.Boolean()
    # Multi-valued integer field (``multi=True``); presumably the ids of
    # the collections the entity belongs to -- TODO confirm.
    collections = dsl.Integer(multi=True)

    class Meta:
        """Meta class for entity search document."""

        index = 'entity'
class Dictionary(es.Document):
    """Elasticsearch document describing a dictionary.

    Stored in the index named by ``ES_INDEX_DICTIONARY_INDEX`` and accessed
    through the ``ES_CLIENT`` connection.
    """

    corpus = es.Keyword()
    name = es.Keyword()
    description = es.Text()
    datetime = es.Date()
    number_of_documents = es.Integer()
    # Boolean readiness flag; what sets it is not visible in this class.
    is_ready = es.Boolean()

    class Index:
        name = ES_INDEX_DICTIONARY_INDEX
        using = ES_CLIENT
class DataDocType(es.DocType):
    """Elasticsearch test model.

    Uses the ``DocType`` base class with a ``Meta`` inner class and is
    stored in the ``test`` index.
    """

    first_name = es.Keyword()
    last_name = es.Keyword()
    city = es.Text()
    skills = es.Keyword()
    birthday = es.Date()
    is_active = es.Boolean()
    score = es.Integer()
    description = es.Text()

    class Meta:
        index = 'test'
def decorator(cls):
    """Attach one ``elasticsearch_dsl`` field per entry of ``cls.schema``.

    ``cls.schema`` is a cerberus-style schema: a dict mapping field names to
    rule dicts that each carry a ``"type"`` key. Two extra keys may be used
    in a rule dict in addition to the standard cerberus syntax:

    * ``"elastic"``: keyword arguments handed raw to the field constructor.
    * ``"elastictype"``: reserved for a more specific elasticsearch_dsl type
      definition; this decorator does not read it, it only removes it.

    Both extra keys are removed from the schema once the field is created.

    :param cls: class exposing a ``schema`` dict; it is modified in place.
    :returns: the same ``cls`` with the field attributes set.
    :raises KeyError: if a rule dict has no ``"type"`` key.
    :raises Exception: if a ``"type"`` value is not supported.
    """
    print("setup_schema:" + cls.__name__.lower())
    import elasticsearch_dsl

    # Schema type name -> name of the elasticsearch_dsl field class.
    # cerberus spells the boolean type "boolean"; the historical "bool"
    # spelling is kept so existing schemas continue to work.
    field_class_names = {
        "integer": "Integer",
        "float": "Float",
        "string": "Text",
        "bool": "Boolean",
        "boolean": "Boolean",
        "date": "Date",
        "datetime": "Date",
        # NOTE(review): cerberus "number" also covers floats; it is mapped
        # to Integer here as before -- confirm this is intended.
        "number": "Integer",
        # NOTE(review): Byte is an 8-bit integer type in Elasticsearch;
        # confirm it is the intended target for "binary" values.
        "binary": "Byte",
        "list": "Keyword",
    }

    for elem, rules in cls.schema.items():
        # The raw field __init__ keyword arguments.
        elastic = rules.get("elastic", {})
        type_name = rules["type"]
        # Non-string type values (e.g. a list of types) are unsupported and
        # must not reach the dict lookup, where they would be unhashable.
        class_name = (
            field_class_names.get(type_name)
            if isinstance(type_name, str) else None
        )
        if class_name is None:
            raise Exception("Wrong Datatype in schema")
        field_class = getattr(elasticsearch_dsl, class_name)
        setattr(cls, elem, field_class(**elastic))
        # Remove the elastic-only keys so the schema is plain cerberus again.
        rules.pop("elastic", None)
        rules.pop("elastictype", None)
    return cls
class GeoCoding(PluginBase):
    """Class that will attempt to geotag a tweet."""

    # Elasticsearch fields for the keys this plugin writes into a tweet.
    data_schema = {
        'geotagged': es.Boolean(),
        'location': es.Object(Location),
        'coordinates': es.GeoPoint(),
    }

    def __init__(self, *args, **kwargs) -> None:
        """Setup Carmen geotagging options, then init super.

        All positional and keyword arguments are passed on unchanged to the
        parent initializer.
        """
        with warnings.catch_warnings():
            # The default setup of carmen appears to raise several warnings, we
            # suppress them with the catch_warnings context manager.
            warnings.simplefilter("ignore")
            resolver_options = {'place': {'allow_unknown_locations': True}}
            self.geotagger = get_resolver(options=resolver_options)
            self.geotagger.load_locations()
        self.location_resolver = LocationEncoder()
        super().__init__(*args, **kwargs)  # type: ignore

    def process_tweet(self, tweet_json: Dict[str, Any]) -> Dict[str, Any]:
        """
        Attempt to geotag the tweet data.

        The resolver is run on ``tweet_json['_raw']``. Returns the tweet
        with new data if any was resolved and will set geotagged according
        to success or failure. ``tweet_json`` is modified in place and is
        also the return value.
        """
        LOG.debug('Attempting to geotag tweet')
        tweet_location = self.geotagger.resolve_tweet(tweet_json['_raw'])

        # Start from "not geotagged"; flipped below once a location resolves.
        tweet_json['geotagged'] = False

        if tweet_location:
            LOG.debug(' This tweet includes location information')
            # Only element 1 of the resolver result is encoded and stored.
            tweet_json['location'] = self.location_resolver.default(
                tweet_location[1])

            # Coordinates are only added when both values are present.
            if 'latitude' in tweet_json[
                    'location'] and 'longitude' in tweet_json['location']:
                tweet_json['coordinates'] = {
                    'lat': tweet_json['location']['latitude'],
                    'lon': tweet_json['location']['longitude'],
                }

            tweet_json['geotagged'] = True
        LOG.debug('Geotagging completed!')
        return tweet_json
class DataDocType(es.Document):
    """Elasticsearch test model.

    Uses the ``Document`` base class with an ``Index`` inner class and is
    stored in the ``test`` index.
    """

    first_name = es.Keyword()
    last_name = es.Keyword()
    city = es.Text()
    skills = es.Keyword()
    birthday = es.Date()
    is_active = es.Boolean()
    score = es.Integer()
    location = es.GeoPoint()
    description = es.Text()

    class Index:
        name = 'test'
class TopicModellingIndex(es.Document):
    """Elasticsearch document describing one topic modelling.

    Holds descriptive fields, date fields, algorithm settings, score
    fields and the nested ``topics``. Stored in the index named by
    ``ES_INDEX_TOPIC_MODELLING`` and accessed through ``ES_CLIENT``.
    """

    corpus = es.Keyword()
    source = es.Keyword()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()
    has_topic_info = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()
    datetime_from = es.Date()
    datetime_to = es.Date()
    algorithm = es.Keyword()
    number_of_topics = es.Integer()
    hierarchical = es.Boolean()
    # Object field declared without an inner document class or properties.
    meta_parameters = es.Object()
    perplexity = es.Float()
    purity = es.Float()
    contrast = es.Float()
    coherence = es.Float()
    tau_smooth_sparse_theta = es.Float()
    tau_smooth_sparse_phi = es.Float()
    tau_decorrelator_phi = es.Float()
    tau_coherence_phi = es.Float()
    # Nested field whose inner document class is ``Topic``.
    topics = es.Nested(Topic)
    is_actualizable = es.Boolean()

    class Index:
        name = ES_INDEX_TOPIC_MODELLING
        using = ES_CLIENT
class Text(es.InnerDoc):
    """Simple Elasticsearch DSL mapping of the text data this plugin will return.

    This is an ``InnerDoc``, so it is meant to be embedded in another
    document rather than indexed on its own.
    """

    full_text = es.Text()
    pattern_polarity = es.Float()
    pattern_subjectivity = es.Float()
    short_text = es.Text()
    translated = es.Text()
    truncated = es.Boolean()
    tweet_length = es.Integer()
    vader_compound = es.Float()
    vader_compound_inverted = es.Float()
    vader_negative = es.Float()
    vader_neutral = es.Float()
    vader_positive = es.Float()
class BaseDocument(dsl.DocType): """Base document class to build ElasticSearch documents. This is standard ``elasticsearch-dsl`` ``DocType`` class with already added fields for handling permissions. """ #: list of user ids with view permission on the object users_with_permissions = dsl.Keyword(multi=True) #: list of group ids with view permission on the object groups_with_permissions = dsl.Keyword(multi=True) #: identifies if object has public view permission assigned public_permission = dsl.Boolean()
class Node(es.DocType):
    """
    Elastic document describing a node.

    Stored in the ``nodes`` index.
    """

    node_type = es.Keyword()
    objectID = es.Keyword()
    # Analyzed with the ``autocomplete`` analyzer, with fielddata enabled.
    name = es.Text(
        fielddata=True,
        analyzer=autocomplete
    )
    # NOTE(review): the sub-fields of this object are passed as ``fields=``;
    # confirm that this (rather than ``properties=``) produces the intended
    # object mapping.
    user = es.Object(
        fields={
            'id': es.Keyword(),
            'name': es.Text(
                fielddata=True,
                analyzer=autocomplete)
        }
    )
    description = es.Text()
    is_free = es.Boolean()
    # NOTE(review): same ``fields=`` question as for ``user`` above.
    project = es.Object(
        fields={
            'id': es.Keyword(),
            'name': es.Keyword(),
            'url': es.Keyword(),
        }
    )
    media = es.Keyword()
    picture = es.Keyword()
    tags = es.Keyword(multi=True)
    license_notes = es.Text()
    created_at = es.Date()
    updated_at = es.Date()

    class Meta:
        index = 'nodes'
class EmbeddingIndex(es.Document):
    """Elasticsearch document describing one embedding.

    Stored in the index named by ``ES_INDEX_EMBEDDING`` and accessed
    through the ``ES_CLIENT`` connection.
    """

    corpus = es.Keyword()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()
    by_unit = es.Keyword()  # Token/Word/Sentence/Text
    algorithm = es.Keyword()
    pooling = es.Keyword()
    # Object field declared without an inner document class or properties.
    meta_parameters = es.Object()

    class Index:
        name = ES_INDEX_EMBEDDING
        using = ES_CLIENT
class KadastraalSubject(es.DocType):
    """Elasticsearch document for a cadastral subject.

    Stored in the index configured as
    ``settings.ELASTIC_INDICES['BRK_SUBJECT']``.
    """

    # Name, analyzed with ``analyzers.naam``. Two sub-fields are declared:
    # ``raw`` (keyword) and ``ngram``, which uses a different analyzer for
    # indexing than for searching.
    naam = es.Text(
        analyzer=analyzers.naam,
        fields={
            'raw': es.Keyword(),
            'ngram': es.Text(
                analyzer=analyzers.kad_sbj_naam,
                search_analyzer=analyzers.kad_obj_aanduiding_keyword)})

    natuurlijk_persoon = es.Boolean()
    geslachtsnaam = es.Text(analyzer=analyzers.naam)
    order = es.Integer()

    subtype = es.Keyword()
    _display = es.Keyword()

    class Index:
        name = settings.ELASTIC_INDICES['BRK_SUBJECT']
class META_DTM(es.Document):
    """Elasticsearch document holding META_DTM settings.

    Stored in the index named by ``ES_INDEX_META_DTM`` and accessed through
    the ``ES_CLIENT`` connection.
    """

    meta_name = es.Keyword()
    volume_days = es.Float()
    delta_days = es.Float()
    reset_index = es.Boolean()
    from_date = es.Date()
    to_date = es.Date()

    class Index:
        name = ES_INDEX_META_DTM
        using = ES_CLIENT

        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 1,
        }
        # NOTE(review): this dict repeats the names and types of the field
        # declarations above; confirm whether anything actually consumes
        # it, and keep both in sync when a field changes.
        mappings = {
            "properties": {
                "meta_name": {
                    "type": "keyword",
                },
                "volume_days": {
                    "type": "float",
                },
                "delta_days": {
                    "type": "float",
                },
                "reset_index": {
                    "type": "boolean",
                },
                "from_date": {
                    "type": "date"
                },
                "to_date": {
                    "type": "date"
                }
            },
        }
class Inschrijving(es.Document):
    """
    Elastic data of 'vestigingen' or 'mac'
    from handelsregister
    """

    maatschappelijke_activiteit_id = es.Keyword()
    vestiging_id = es.Keyword()

    dataset = es.Keyword()

    kvk_nummer = es.Keyword()
    handelsnaam = es.Keyword()
    datum_aanvang = es.Date()
    eigenaar_naam = es.Keyword()
    eigenaar_id = es.Keyword()
    non_mailing = es.Boolean()

    aantal_werkzame_personen = es.Integer()
    rechtsvorm = es.Keyword()

    # Address information
    bezoekadres_volledig_adres = es.Keyword()
    bezoekadres_correctie = es.Boolean()
    bezoekadres_afgeschermd = es.Boolean()
    bezoekadres_openbare_ruimte = es.Keyword()
    bezoekadres_huisnummer = es.Integer()
    bezoekadres_huisletter = es.Keyword()
    bezoekadres_huisnummertoevoeging = es.Keyword()
    bezoekadres_postcode = es.Keyword()
    bezoekadres_plaats = es.Keyword()

    bezoekadres_buurt_code = es.Keyword()
    bezoekadres_buurt_naam = es.Keyword()
    bezoekadres_buurtcombinatie_code = es.Keyword()
    bezoekadres_buurtcombinatie_naam = es.Keyword()
    bezoekadres_ggw_code = es.Keyword()
    bezoekadres_ggw_naam = es.Keyword()
    bezoekadres_gsg_naam = es.Keyword()
    bezoekadres_stadsdeel_code = es.Keyword()
    bezoekadres_stadsdeel_naam = es.Keyword()

    postadres_volledig_adres = es.Keyword()
    postadres_correctie = es.Boolean()
    postadres_afgeschermd = es.Boolean()
    postadres_openbare_ruimte = es.Keyword()
    postadres_huisnummer = es.Integer()
    postadres_huisletter = es.Keyword()
    postadres_huisnummertoevoeging = es.Keyword()
    postadres_postcode = es.Keyword()
    postadres_plaats = es.Keyword()

    # And the bag numid
    bag_numid = es.Keyword()
    # Chained assignment: both names refer to the same Keyword instance.
    adresseerbaar_object_id = identificatie = es.Keyword()
    centroid = es.GeoPoint()

    # Categories
    hoofdcategorie = es.Keyword(multi=True)
    subcategorie = es.Keyword(multi=True)

    # SBI codes
    sbi_code = es.Text(
        multi=True,
        fielddata=True,
        analyzer=autocomplete,
    )
    sbi_omschrijving = es.Keyword(multi=True)

    sbi_l1 = es.Keyword(multi=True)
    sbi_l2 = es.Keyword(multi=True)
    sbi_l3 = es.Keyword(multi=True)
    sbi_l4 = es.Keyword(multi=True)
    sbi_l5 = es.Keyword(multi=True)

    # Special legal status ("bijzondere rechtstoestand")

    # status = es.Keyword()
    bijzondere_rechtstoestand = es.Keyword()

    class Meta:
        # Meta field ``all`` is declared with ``enabled=False``.
        all = es.MetaField(enabled=False)
        doc_type = 'vestiging'

    class Index:
        doc_type = 'vestiging'
        name = settings.ELASTIC_INDICES['DS_HR_INDEX']
class Job(es.DocType):
    """Elasticsearch document for a job offer.

    The class body first declares the analysis chain (token filters,
    tokenizer and analyzers) and then the fields that use it. The
    properties expose the index and doc type names, formatted publication
    dates and the tags split by membership in the module-level
    ``condition_tags`` collection.
    """

    class Meta:
        index = 'jobs'
        doc_type = 'job-offer'

    # --- French text analysis -------------------------------------------
    french_elision = es.token_filter(
        'french_elision',
        type='elision',
        articles_case=True,
        articles=[
            'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
            'jusqu', 'quoiqu', 'lorsqu', 'puisqu'
        ])

    french_stopwords = es.token_filter(
        'french_stopwords', type='stop', stopwords='_french_')

    # Do not include this filter if keywords is empty
    french_keywords = es.token_filter(
        'french_keywords', type='keyword_marker', keywords=[])

    french_stemmer = es.token_filter(
        'french_stemmer', type='stemmer', language='light_french')

    french_analyzer = es.analyzer(
        'french_analyzer',
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            french_elision,
            french_stopwords,
            # french_keywords,
            french_stemmer
        ],
        char_filter=['html_strip'])

    # --- Technology names -----------------------------------------------
    technologies_tokenizer = es.tokenizer(
        'comma_tokenizer', type='pattern', pattern=' |,|, ')

    technologies_synonyms_filter = es.token_filter(
        'technologies_synonyms',
        type='synonym',
        synonyms=[
            'c => c_language',
            'c++, cpp => cpp_language',
            'c/c++, c/cpp => c_language',
            'c/c++, c/cpp => cpp_language',
            'c#, c♯, csharp => csharp_language',
            'f#, f♯, fsharp => fsharp_language',
            'c#, c♯, csharp => dotnet',
            'f#, f♯, fsharp => dotnet',
            '.net => dotnet'
        ])

    technologies_analyzer = es.analyzer(
        'technologies_analyzer',
        tokenizer=technologies_tokenizer,
        filter=['lowercase', 'asciifolding', technologies_synonyms_filter])

    # --- Company names --------------------------------------------------
    company_name_analyzer = es.analyzer(
        'company_name_analyzer',
        tokenizer='standard',
        filter=['lowercase', 'asciifolding'])

    # --- Fields -----------------------------------------------------------
    id = es.Integer()

    url = es.String(index='no')
    source = es.String(index='not_analyzed')

    # Title and description carry a "technologies" sub-field analyzed with
    # the technologies analyzer.
    title = es.String(
        analyzer=french_analyzer,
        fields={'technologies': es.String(analyzer=technologies_analyzer)})

    description = es.String(
        analyzer=french_analyzer,
        fields={'technologies': es.String(analyzer=technologies_analyzer)})

    company = es.String(analyzer=company_name_analyzer)
    company_url = es.String(index='no')

    address = es.String(analyzer=french_analyzer)
    address_is_valid = es.Boolean()

    tags = es.Nested(
        doc_class=Tag,
        properties=dict(tag=es.String(index='not_analyzed'),
                        weight=es.Integer()))

    publication_datetime = es.Date()
    publication_datetime_is_fake = es.Boolean()

    crawl_datetime = es.Date()

    geolocation = es.GeoPoint()
    geolocation_is_valid = es.Boolean()

    def __init__(self, meta=None, **kwargs):
        """Build the document, then recompute the doc type's index name."""
        super(Job, self).__init__(meta, **kwargs)
        computed_name = compute_index_name(self.index)
        self._doc_type.index = computed_name

    @property
    def index(self):
        """Index name currently stored on the doc type."""
        doc_type_options = self._doc_type
        return doc_type_options.index

    @property
    def doc_type(self):
        """Doc type name stored on the doc type."""
        doc_type_options = self._doc_type
        return doc_type_options.name

    @property
    def published(self):
        """Publication datetime formatted with the 'FR_fr' locale."""
        return format_date(self.publication_datetime, locale='FR_fr')

    @property
    def published_in_days(self):
        """Time elapsed since publication, formatted with day granularity."""
        # TODO: bugfix
        elapsed = datetime.now() - self.publication_datetime
        return format_timedelta(elapsed, granularity='day', locale='en_US')

    @property
    def alltags(self):
        """``Tag2`` objects for the tags that are not condition tags."""
        if not self.tags:
            return []
        return [
            Tag2(entry['tag'], entry['weight'])
            for entry in self.tags
            if entry['tag'] not in condition_tags
        ]

    @property
    def condition_tags(self):
        """``Tag2`` objects, with their CSS, for the condition tags."""
        if not self.tags:
            return []
        # The bare name below is the module-level ``condition_tags``, not
        # this property.
        return [
            Tag2(entry['tag'], entry['weight'], Tag2.get_css(entry['tag']))
            for entry in self.tags
            if entry['tag'] in condition_tags
        ]
class DictionaryWord(es.Document):
    """Elasticsearch document describing one word of a dictionary.

    Holds the word and its normal form, boolean flags, frequency counts and
    relative (float) measures. Stored in the index named by
    ``ES_INDEX_DICTIONARY_WORD`` and accessed through ``ES_CLIENT``.
    """

    dictionary = es.Keyword()
    word = es.Keyword()
    word_normal = es.Keyword()

    is_in_pymorphy2_dict = es.Boolean()
    is_multiple_normals_in_pymorphy2 = es.Boolean()
    is_stop_word = es.Boolean()
    is_latin = es.Boolean()
    is_kazakh = es.Boolean()
    n_gram_len = es.Integer()
    pos_tag = es.Keyword()
    word_len = es.Integer()

    word_frequency = es.Integer()
    word_normal_frequency = es.Integer()
    document_frequency = es.Integer()
    document_normal_frequency = es.Integer()

    word_frequency_relative = es.Float()
    word_normal_frequency_relative = es.Float()
    document_frequency_relative = es.Float()
    document_normal_frequency_relative = es.Float()

    word_first_capital_ratio = es.Float()
    word_normal_first_capital_ratio = es.Float()

    class Index:
        name = ES_INDEX_DICTIONARY_WORD  # f"{ES_INDEX_DICTIONARY_WORD}_{name}{_temp}"
        using = ES_CLIENT

        settings = {
            "number_of_shards": 3,
            "number_of_replicas": 1,
        }
        # NOTE(review): this dict repeats the names and types of the field
        # declarations above; confirm whether anything actually consumes
        # it, and keep both in sync when a field changes.
        mappings = {
            "properties": {
                "dictionary": {
                    "type": "keyword",
                },
                "word": {
                    "type": "keyword",
                },
                "word_normal": {
                    "type": "keyword",
                },
                "is_in_pymorphy2_dict": {
                    "type": "boolean",
                },
                "is_multiple_normals_in_pymorphy2": {
                    "type": "boolean",
                },
                "is_stop_word": {
                    "type": "boolean",
                },
                "is_latin": {
                    "type": "boolean",
                },
                "is_kazakh": {
                    "type": "boolean",
                },
                "n_gram_len": {
                    "type": "integer",
                },
                "pos_tag": {
                    "type": "keyword",
                },
                "word_len": {
                    "type": "integer",
                },
                "word_frequency": {
                    "type": "integer",
                },
                "word_normal_frequency": {
                    "type": "integer",
                },
                "document_frequency": {
                    "type": "integer",
                },
                "document_normal_frequency": {
                    "type": "integer",
                },
                "word_frequency_relative": {
                    "type": "float",
                },
                "word_normal_frequency_relative": {
                    "type": "float",
                },
                "document_frequency_relative": {
                    "type": "float",
                },
                "document_normal_frequency_relative": {
                    "type": "float",
                },
                "word_first_capital_ratio": {
                    "type": "float",
                },
                "word_normal_first_capital_ratio": {
                    "type": "float",
                },
            },
        }
class Nummeraanduiding(es.DocType):
    """
    All bag objects should have one or more addresses.

    A "nummeraanduiding", commonly also called an address, is a designation
    of a verblijfsobject, standplaats or ligplaats that has been assigned
    as such by the competent municipal body.

    [Stelselpedia](http://www.amsterdam.nl/stelselpedia/bag-index/catalogus-bag/objectklasse-2/)
    """

    # The street name and address text fields below share one layout: the
    # text is analyzed with ``analyzers.adres`` and exposes a ``raw``
    # keyword sub-field plus an autocomplete sub-field that is searched
    # with the 'standard' analyzer.
    straatnaam = es.Text(analyzer=analyzers.adres,
                         fields={
                             'raw': es.Keyword(),
                             'ngram_edge': es.Text(analyzer=analyzers.autocomplete,
                                                   search_analyzer='standard')
                         })

    straatnaam_keyword = es.Keyword()

    straatnaam_nen = es.Text(analyzer=analyzers.adres,
                             fields={
                                 'raw': es.Keyword(),
                                 'ngram_edge': es.Text(analyzer=analyzers.autocomplete,
                                                       search_analyzer='standard')
                             })

    straatnaam_nen_keyword = es.Keyword()

    straatnaam_ptt = es.Text(analyzer=analyzers.adres,
                             fields={
                                 'raw': es.Keyword(),
                                 'ngram_edge': es.Text(analyzer=analyzers.autocomplete,
                                                       search_analyzer='standard'),
                                 'keyword': es.Keyword(normalizer=analyzers.lowercase),
                             })

    straatnaam_ptt_keyword = es.Keyword()

    adres = es.Text(analyzer=analyzers.adres,
                    fields={
                        'raw': es.Keyword(),
                        'ngram_edge': es.Text(analyzer=analyzers.autocomplete,
                                              search_analyzer='standard'),
                    })

    comp_address = es.Text(analyzer=analyzers.adres,
                           fields={
                               'raw': es.Keyword(),
                               'ngram': es.Text(analyzer=analyzers.autocomplete,
                                                search_analyzer='standard')
                           })
    comp_address_nen = es.Text(analyzer=analyzers.adres,
                               fields={
                                   'raw': es.Keyword(),
                                   'ngram': es.Text(analyzer=analyzers.autocomplete,
                                                    search_analyzer='standard')
                               })
    comp_address_ptt = es.Text(analyzer=analyzers.adres,
                               fields={
                                   'raw': es.Keyword(),
                                   'ngram': es.Text(analyzer=analyzers.autocomplete,
                                                    search_analyzer='standard')
                               })
    comp_address_pcode = es.Text(analyzer=analyzers.adres,
                                 fields={
                                     'raw': es.Keyword(),
                                     'ngram': es.Text(analyzer=analyzers.autocomplete,
                                                      search_analyzer='standard')
                                 })

    # Integer house number with a text sub-field ``variation``.
    huisnummer = es.Integer(
        fields={'variation': es.Text(analyzer=analyzers.huisnummer)})

    toevoeging = es.Text(analyzer=analyzers.toevoeging,
                         fields={'keyword': es.Keyword()})

    # to return official bag fields
    bag_toevoeging = es.Keyword()
    bag_huisletter = es.Keyword()

    woonplaats = es.Keyword()

    # The sub-fields come from ``postcode_fields``, defined outside this
    # class.
    postcode = es.Text(
        analyzer=analyzers.postcode,
        fields=postcode_fields,
    )

    order = es.Integer()

    hoofdadres = es.Boolean()

    # Nested code/description pair; the code uses the lowercase normalizer.
    status = es.Nested(
        properties={
            'code': es.Keyword(normalizer=analyzers.lowercase),
            'omschrijving': es.Text()
        })

    vbo_status = es.Nested(
        properties={
            'code': es.Keyword(normalizer=analyzers.lowercase),
            'omschrijving': es.Text()
        })

    subtype = es.Keyword()
    _display = es.Keyword()

    landelijk_id = es.Text(analyzer=analyzers.autocomplete,
                           fields={
                               'raw': es.Keyword(),
                               'nozero': es.Text(analyzer=analyzers.nozero)
                           })

    adresseerbaar_object_id = es.Text(
        # Is landelijk_id for related verblijfsobject, ligplaats or standplaats
        analyzer=analyzers.autocomplete,
        fields={
            'raw': es.Keyword(),
            'nozero': es.Text(analyzer=analyzers.nozero)
        })

    class Index:
        name = settings.ELASTIC_INDICES['NUMMERAANDUIDING']
class ResponseDocType(FjordDocType):
    """Elasticsearch doc type for ``Response`` objects.

    Declares the mapping, a "more like this" search (:meth:`mlt`), the
    list of publicly visible fields and the conversion of a ``Response``
    into an indexable dict (:meth:`extract_doc`).
    """

    id = es_dsl.Integer()
    happy = es_dsl.Boolean()
    api = es_dsl.Integer()
    url = es_dsl.String(index='not_analyzed')
    url_domain = es_dsl.String(index='not_analyzed')
    has_email = es_dsl.Boolean()
    description = es_dsl.String(analyzer='snowball')
    category = es_dsl.String(index='not_analyzed')
    description_bigrams = es_dsl.String(index='not_analyzed')
    description_terms = es_dsl.String(analyzer='standard')
    user_agent = es_dsl.String(index='not_analyzed')
    product = es_dsl.String(index='not_analyzed')
    channel = es_dsl.String(index='not_analyzed')
    version = es_dsl.String(index='not_analyzed')
    browser = es_dsl.String(index='not_analyzed')
    browser_version = es_dsl.String(index='not_analyzed')
    platform = es_dsl.String(index='not_analyzed')
    locale = es_dsl.String(index='not_analyzed')
    country = es_dsl.String(index='not_analyzed')
    device = es_dsl.String(index='not_analyzed')
    manufacturer = es_dsl.String(index='not_analyzed')
    source = es_dsl.String(index='not_analyzed')
    campaign = es_dsl.String(index='not_analyzed')
    # Must match the 'source_campaign' key written by extract_doc(); the
    # field used to be declared under the misspelled name "souce_campaign",
    # so this not_analyzed mapping never applied to the indexed values.
    # NOTE(review): an existing index may need to be rebuilt for the
    # corrected mapping to take effect.
    source_campaign = es_dsl.String(index='not_analyzed')
    organic = es_dsl.Boolean()
    created = es_dsl.Date()

    docs = ResponseDocTypeManager()

    class Meta:
        pass

    def mlt(self):
        """Returns a search with a morelikethis query for docs like this

        The search is restricted to the same product and platform when
        this document has them set.
        """
        # Short responses tend to not repeat any words, so then MLT
        # returns nothing. This fixes that by setting min_term_freq to
        # 1. Longer responses tend to repeat important words, so we can
        # set min_term_freq to 2.
        num_words = len(self.description.split(' '))
        if num_words > 40:
            min_term_freq = 2
        else:
            min_term_freq = 1

        s = self.search()
        if self.product:
            s = s.filter('term', product=self.product)
        if self.platform:
            s = s.filter('term', platform=self.platform)

        s = s.query('more_like_this',
                    fields=['description'],
                    docs=[{
                        '_index': get_index_name(),
                        '_type': self._doc_type.name,
                        '_id': self.id
                    }],
                    min_term_freq=min_term_freq,
                    stop_words=list(ANALYSIS_STOPWORDS))
        return s

    @classmethod
    def get_model(cls):
        """Return the model class indexed by this doc type."""
        return Response

    @classmethod
    def public_fields(cls):
        """Fields that can be publicly-visible

        .. Note::

           Do NOT include fields that have PII in them.

        """
        return ('id', 'happy', 'api', 'url_domain', 'has_email',
                'description', 'category', 'description_bigrams',
                'user_agent', 'product', 'version', 'platform', 'locale',
                'source', 'campaign', 'organic', 'created')

    @property
    def truncated_description(self):
        """Shorten feedback for dashboard view."""
        return smart_truncate(self.description, length=500)

    @classmethod
    def extract_doc(cls, resp, with_id=True):
        """Converts a Response to a dict of values

        This can be used with ``ResponseDocType.from_obj()`` to create a
        ``ResponseDocType`` object or it can be used for indexing.

        :arg resp: a Response object
        :arg with_id: whether or not to include the ``_id`` value--include
            it when you're bulk indexing

        :returns: a dict

        """
        doc = {
            'id': resp.id,
            'happy': resp.happy,
            'api': resp.api,
            'url': resp.url,
            'url_domain': resp.url_domain,
            'has_email': bool(resp.user_email),
            'description': resp.description,
            'user_agent': resp.user_agent,
            'product': resp.product,
            'channel': resp.channel,
            'version': resp.version,
            'browser': resp.browser,
            'browser_version': resp.browser_version,
            'platform': resp.platform,
            'locale': resp.locale,
            'country': resp.country,
            'device': resp.device,
            'manufacturer': resp.manufacturer,
            'source': resp.source,
            'campaign': resp.campaign,
            # Source and campaign joined by '::'; a missing part becomes '--'.
            'source_campaign': '::'.join([(resp.source or '--'),
                                          (resp.campaign or '--')]),
            # A response without a campaign counts as organic.
            'organic': (not resp.campaign),
            'created': resp.created
        }

        # We only compute bigrams for english because the analysis
        # uses English stopwords, stemmers, ...
        if resp.locale.startswith(u'en') and resp.description:
            doc['description_bigrams'] = compute_grams(resp.description)
        else:
            doc['description_bigrams'] = []

        if with_id:
            doc['_id'] = doc['id']
        return doc
class AWSDetailedLineitem(dsl.DocType):
    """Mapping and query helpers for the ``awsdetailedlineitem`` index.

    The class attributes declare the document fields. The classmethods
    build filtered/aggregated searches with the DSL and execute them
    through the module-level ``client``; most of them are wrapped by
    ``with_cache``. Unless stated otherwise, ``keys`` is one linked
    account id or a list of them, and a missing ``date_from``/``date_to``
    pair defaults to the current UTC month.
    """

    class Meta:
        index = 'awsdetailedlineitem'

    availability_zone = dsl.String(index='not_analyzed')
    cost = dsl.Double()
    un_blended_cost = dsl.Double()
    item_description = dsl.String(index='not_analyzed')
    linked_account_id = dsl.String(index='not_analyzed')
    operation = dsl.String()
    payer_account_id = dsl.String(index='not_analyzed')
    pricing_plan_id = dsl.Long()
    product_name = dsl.String(index='not_analyzed')
    rate = dsl.Double()
    un_blended_rate = dsl.Double()
    rate_id = dsl.Long()
    record_id = dsl.String(index='not_analyzed')
    reserved_instance = dsl.Boolean()
    resource_id = dsl.String(index='not_analyzed')
    subscription_id = dsl.Long()
    tag = dsl.Object(
        properties={
            'key': dsl.String(index='not_analyzed'),
            'value': dsl.String(index='not_analyzed')
        })
    usage_end_date = dsl.Date(format='strict_date_optional_time||epoch_millis')
    usage_quantity = dsl.Double()
    usage_start_date = dsl.Date(
        format='strict_date_optional_time||epoch_millis')
    usage_type = dsl.String(index='not_analyzed')

    # ------------------------------------------------------------------
    # Private helpers shared by the query methods below.
    # ------------------------------------------------------------------

    @staticmethod
    def _as_list(keys):
        """Return ``keys`` unchanged if it is a list, else wrap it in one."""
        return keys if isinstance(keys, list) else [keys]

    @staticmethod
    def _month_bounds(date_from, date_to):
        """Fill in missing bounds of a month-long window.

        ``date_from`` defaults to the first instant of the current UTC
        month; ``date_to`` defaults to the last microsecond of the month
        containing ``date_from``.

        :returns: ``(date_from, date_to)``
        """
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        last_day = calendar.monthrange(date_from.year, date_from.month)[1]
        date_to = date_to or date_from.replace(
            day=last_day, hour=23, minute=59, second=59, microsecond=999999)
        return date_from, date_to

    @staticmethod
    def _date_range(date_from, date_to):
        """Build the body of a ``range`` filter from two datetimes."""
        return {
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        }

    @staticmethod
    def _search_index(s, size=0):
        """Execute the DSL search ``s`` and return the raw response dict."""
        return client.search(index='awsdetailedlineitem',
                             body=s.to_dict(),
                             size=size,
                             request_timeout=60)

    @classmethod
    def _for_accounts(cls, keys):
        """Return a new search restricted to the given linked account(s)."""
        return cls.search().filter(
            'terms', linked_account_id=cls._as_list(keys))

    # ------------------------------------------------------------------
    # Presence / date span
    # ------------------------------------------------------------------

    @classmethod
    @with_cache(ttl=3600 * 3, worker_refresh=True)
    def keys_has_data(cls, keys, date_from=None, date_to=None):
        """Tell whether any line item exists for the given account(s).

        The date window is applied only when ``date_from`` is given;
        ``date_to`` then defaults to the current UTC time.
        """
        date_to = date_to or datetime.utcnow()
        s = cls._for_accounts(keys)
        if date_from:
            s = s.filter('range',
                         usage_start_date=cls._date_range(date_from, date_to))
        res = cls._search_index(s)
        return res['hits']['total'] > 0

    @classmethod
    @with_cache(is_json=False, ret=lambda x: datetime.strptime(x, "%Y-%m-%d"))
    def get_first_date(cls, keys):
        """Return the date part of the earliest ``usage_start_date``.

        :returns: the text before ``'T'`` of the stored value, or ``None``
            when there is no hit. NOTE(review): the ``ret`` hook of
            ``with_cache`` presumably turns this into a ``datetime`` for
            callers -- confirm against ``with_cache``.
        """
        s = cls._for_accounts(keys)
        s = s.sort('usage_start_date')
        res = cls._search_index(s, size=1)
        if res['hits']['total'] == 0:
            return
        first_hit = res['hits']['hits'][0]
        return first_hit['_source']['usage_start_date'].split('T')[0]

    @classmethod
    @with_cache(is_json=False, ret=lambda x: datetime.strptime(x, "%Y-%m-%d"))
    def get_last_date(cls, keys, limit=None):
        """Return the date part of the latest ``usage_start_date``.

        :arg limit: optional datetime used as upper bound of the search
        :returns: same convention as :meth:`get_first_date`
        """
        s = cls._for_accounts(keys)
        if limit:
            s = s.filter('range', usage_start_date={'to': limit.isoformat()})
        s = s.sort('-usage_start_date')
        res = cls._search_index(s, size=1)
        if res['hits']['total'] == 0:
            return
        last_hit = res['hits']['hits'][0]
        return last_hit['_source']['usage_start_date'].split('T')[0]

    @classmethod
    def get_first_to_now_date(cls, keys):
        """List dates one month apart from the first date up to now (UTC)."""
        def from_date_to_today(d):
            now = datetime.utcnow()
            while d < now:
                yield d
                d += relativedelta(months=1)

        return list(from_date_to_today(cls.get_first_date(keys)))

    @classmethod
    def get_first_to_last_date(cls, keys):
        """List dates one month apart from the first to the last date."""
        def from_date_to_last(d):
            last = cls.get_last_date(keys)
            while d < last:
                yield d
                d += relativedelta(months=1)

        return list(from_date_to_last(cls.get_first_date(keys)))

    # ------------------------------------------------------------------
    # Tags
    # ------------------------------------------------------------------

    @classmethod
    @with_cache(6 * 3600)
    def get_available_tags(cls, keys, only_with_data=None, product_name=None):
        """Return the sorted user tag names found on the line items.

        Only tag keys starting with ``user:`` are considered; the name is
        the segment right after that prefix.

        :arg only_with_data: when truthy, it is forwarded to the
            ``AWSStat.latest_*_cpu_usage_by_tag`` lookups and a tag is
            kept only if one of them lists it
        :arg product_name: optional exact ``product_name`` filter
        :returns: ``{'tags': [...]}``
        """
        s = cls._for_accounts(keys)
        if product_name:
            s = s.filter('term', product_name=product_name)
        s.aggs.bucket('tag_key', 'terms', field='tag.key')
        res = cls._search_index(s)

        tags = []
        for tag in res['aggregations']['tag_key']['buckets']:
            if tag['key'].startswith('user:'):
                name = tag['key'].split(':')[1]
                if (not only_with_data
                        or name in AWSStat.latest_hourly_cpu_usage_by_tag(
                            only_with_data)['tags']
                        or name in AWSStat.latest_daily_cpu_usage_by_tag(
                            only_with_data)['tags']):
                    tags.append(name)
        tags.sort()
        return dict(tags=tags)

    @classmethod
    @with_cache(ttl=6 * 3600)
    def get_cost_by_tag(cls, keys, tag, date_from=None, date_to=None):
        """Sum the cost per value of the user tag ``tag``.

        :returns: ``{'tags': [{'tag_value', 'cost'}, ...], 'total_cost'}``
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls._for_accounts(keys)
        s = s.filter('term', **{'tag.key': 'user:{}'.format(tag)})
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        s.aggs.bucket('total_cost', 'sum', field='cost')
        agg = s.aggs.bucket('tag_value',
                            'terms',
                            field='tag.value',
                            size=0x7FFFFFFF)
        agg.bucket('cost', 'sum', field='cost')
        res = cls._search_index(s)

        tags = [{
            'tag_value': bucket['key'],
            'cost': bucket['cost']['value'],
        } for bucket in res['aggregations']['tag_value']['buckets']]
        return dict(tags=tags,
                    total_cost=res['aggregations']['total_cost']['value'])

    # ------------------------------------------------------------------
    # Costs
    # ------------------------------------------------------------------

    @classmethod
    @with_cache(ttl=6 * 3600)
    def get_cost(cls, keys, date_from, date_to=None):
        """Sum the cost between ``date_from`` and ``date_to``.

        A falsy ``date_from`` becomes the start of the current UTC month;
        a missing ``date_to`` becomes the last microsecond of the *day*
        of ``date_from`` (not of its month).

        :returns: ``{'total_cost': ...}``
        """
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls._for_accounts(keys)
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        s.aggs.bucket('total_cost', 'sum', field='cost')
        res = cls._search_index(s)
        return dict(total_cost=res['aggregations']['total_cost']['value'])

    @classmethod
    @with_cache()
    def get_monthly_cost_by_tag(cls, keys, tag, date_from=None, date_to=None):
        """Sum the cost per month and per value of the user tag ``tag``.

        :returns: ``{'months': [{'month', 'tags', 'total_cost'}, ...]}``
            where ``month`` is the date part of the bucket key minus its
            last three characters.
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls._for_accounts(keys)
        s = s.filter('term', **{'tag.key': 'user:{}'.format(tag)})
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='month',
                            min_doc_count=1)
        agg.bucket('total_cost', 'sum', field='cost')
        agg = agg.bucket('tag_value',
                         'terms',
                         field='tag.value',
                         size=0x7FFFFFFF)
        agg.bucket('cost', 'sum', field='cost')
        res = cls._search_index(s)

        months = [{
            'month': interval['key_as_string'].split('T')[0][:-3],
            'tags': [{
                'tag_value': bucket['key'],
                'cost': bucket['cost']['value'],
            } for bucket in interval['tag_value']['buckets']],
            'total_cost': interval['total_cost']['value'],
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(months=months)

    @classmethod
    @with_cache()
    def get_cost_by_product(cls,
                            key,
                            date_from=None,
                            date_to=None,
                            without_discount=False,
                            only_discount=False,
                            size=0x7FFFFFFF):
        """Sum the non-zero costs of one account per product.

        :arg key: a single linked account id
        :arg without_discount: exclude the ``PAR_APN_ProgramFee_2500``
            item description
        :arg only_discount: keep only that item description
        :arg size: maximum number of product buckets
        :returns: ``{'products': [{'product', 'cost'}, ...]}`` ordered by
            descending cost; product names go through ``SHORT_NAMES``
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        if without_discount:
            s = s.query(
                'bool',
                filter=[
                    ~dsl.Q('term', item_description='PAR_APN_ProgramFee_2500')
                ])
        if only_discount:
            s = s.filter('term', item_description='PAR_APN_ProgramFee_2500')
        agg = s.aggs.bucket('products',
                            'terms',
                            field='product_name',
                            order={'cost': 'desc'},
                            size=size)
        agg.bucket('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = cls._search_index(s)

        products = [{
            'product': SHORT_NAMES.get(product['key'], product['key']),
            'cost': product['cost']['value'],
        } for product in res['aggregations']['products']['buckets']]
        return dict(products=products)

    @classmethod
    @with_cache()
    def get_cost_by_region(cls,
                           keys,
                           tagged=False,
                           byaccount=False,
                           date_from=None,
                           date_to=None,
                           size=0x7FFFFFFF):
        """Aggregate the cost per month and availability zone.

        :arg tagged: add a ``tags`` sub-aggregation on ``tag.value``
        :arg byaccount: wrap everything in an ``accounts`` aggregation
        :arg size: maximum number of region buckets
        :returns: the raw ``aggregations`` part of the response
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls._for_accounts(keys)
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))

        agg = s.aggs
        if byaccount:
            agg = agg.bucket('accounts', 'terms', field='linked_account_id')
        agg = agg.bucket('intervals',
                         'date_histogram',
                         field='usage_start_date',
                         interval='month',
                         min_doc_count=1)
        agg = agg.bucket('regions',
                         'terms',
                         field='availability_zone',
                         size=size)
        agg.bucket('cost', 'sum', field='cost')
        if tagged:
            agg = agg.bucket('tags', 'terms', field='tag.value')
            agg.bucket('cost', 'sum', field='cost')
        res = cls._search_index(s)
        return res['aggregations']

    @classmethod
    @with_cache()
    def get_monthly_cost(cls,
                         keys,
                         date_from=None,
                         date_to=None,
                         size=0x7FFFFFFF):
        """Sum the cost per month.

        ``size`` is accepted but not used by the query.

        :returns: ``{'months': [{'month', 'total_cost'}, ...]}``
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls._for_accounts(keys)
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='month',
                            min_doc_count=1)
        agg.bucket('cost', 'sum', field='cost')
        res = cls._search_index(s)

        months = [{
            'month': interval['key_as_string'].split('T')[0],
            'total_cost': interval['cost']['value'],
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(months=months)

    @classmethod
    @with_cache()
    def get_monthly_cost_by_product(cls,
                                    keys,
                                    tagged=False,
                                    date_from=None,
                                    date_to=None,
                                    size=0x7FFFFFFF):
        """Sum the non-zero costs per month and product.

        :arg tagged: also break each product down by ``tag.value``; the
            part of the product cost not covered by the tag buckets is
            reported under the name ``'untagged'``
        :arg size: maximum number of product buckets
        :returns: ``{'months': [{'month', 'products'}, ...]}``
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls._for_accounts(keys)
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='month',
                            min_doc_count=1)
        agg = agg.bucket('products',
                         'terms',
                         field='product_name',
                         size=size)
        agg.bucket('cost', 'sum', field='cost')
        if tagged:
            agg = agg.bucket('tags', 'terms', field='tag.value')
            agg.bucket('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = cls._search_index(s)

        def tagged_cost(bucket, total):
            # Yield (tag value, cost) pairs, then the remainder if any.
            total_tag = 0.0
            for tag in bucket:
                total_tag += tag['cost']['value']
                yield (tag['key'], tag['cost']['value'])
            if total != total_tag:
                yield ('untagged', total - total_tag)

        def product_entry(product):
            entry = {
                'product': SHORT_NAMES.get(product['key'], product['key']),
                'cost': product['cost']['value'],
            }
            if tagged:
                entry['tags'] = [{
                    'name': name,
                    'cost': cost,
                } for name, cost in tagged_cost(product['tags']['buckets'],
                                                product['cost']['value'])]
            return entry

        months = [{
            'month': interval['key_as_string'].split('T')[0],
            'products': [
                product_entry(product)
                for product in interval['products']['buckets']
            ],
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(months=months)

    @classmethod
    @with_cache(ttl=4 * 3600)
    def get_daily_cost_by_product(cls,
                                  keys,
                                  date_from=None,
                                  date_to=None,
                                  size=0x7FFFFFFF):
        """Sum the non-zero costs per day and product.

        The default window is the current UTC day.

        :returns: ``{'days': [{'day', 'products'}, ...]}``
        """
        date_from = date_from or datetime.utcnow().replace(
            hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls._for_accounts(keys)
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='day',
                            min_doc_count=1)
        agg = agg.bucket('products',
                         'terms',
                         field='product_name',
                         size=size)
        agg.metric('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = cls._search_index(s)

        days = [{
            'day': interval['key_as_string'].split('T')[0],
            'products': [{
                'product': SHORT_NAMES.get(product['key'], product['key']),
                'cost': product['cost']['value'],
            } for product in interval['products']['buckets']]
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(days=days)

    @classmethod
    @with_cache(ttl=24 * 3600)
    def get_yearly_cost_by_product(cls,
                                   keys,
                                   date_from=None,
                                   date_to=None,
                                   size=0x7FFFFFFF):
        """Sum the non-zero costs per year and product.

        The default window is the current UTC year.

        :returns: ``{'years': [{'year', 'products'}, ...]}``
        """
        date_from = date_from or datetime.utcnow().replace(
            month=1, day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(month=12,
                                               day=31,
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls._for_accounts(keys)
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='year',
                            min_doc_count=1)
        agg = agg.bucket('products',
                         'terms',
                         field='product_name',
                         size=size)
        agg.metric('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = cls._search_index(s)

        years = [{
            'year': interval['key_as_string'][:4],
            'products': [{
                'product': SHORT_NAMES.get(product['key'], product['key']),
                'cost': product['cost']['value'],
            } for product in interval['products']['buckets']]
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(years=years)

    @classmethod
    @with_cache()
    def get_cost_by_resource(cls,
                             keys,
                             date_from=None,
                             date_to=None,
                             search=None):
        """Sum the cost per resource id, most expensive first.

        :arg search: optional substring; only resource ids matching
            ``*<search>*`` are kept
        :returns: ``[{'resource', 'cost'}, ...]``
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls._for_accounts(keys)
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        if search:
            s = s.query('wildcard', resource_id='*{}*'.format(search))
        agg = s.aggs.bucket('resources',
                            'terms',
                            field='resource_id',
                            order={'cost': 'desc'},
                            size=0x7FFFFFFF)
        agg.bucket('cost', 'sum', field='cost')
        res = cls._search_index(s)

        resources = [{
            'resource': resource['key'],
            'cost': resource['cost']['value'],
        } for resource in res['aggregations']['resources']['buckets']]
        return resources

    @classmethod
    def get_monthly_cost_by_resource(cls,
                                     resource_ids,
                                     date_from=None,
                                     date_to=None):
        """Sum the cost per month for the given resource ids.

        :arg resource_ids: iterable of resource ids; when empty no query
            is issued
        :returns: ``{bucket 'key_as_string': cost}``, or ``{}``
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        if not resource_ids:
            return {}
        s = cls.search()
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        s = s.filter('terms', resource_id=list(resource_ids))
        agg = s.aggs.bucket('months',
                            'date_histogram',
                            field='usage_start_date',
                            interval='month',
                            min_doc_count=1)
        agg.metric('cost', 'sum', field='cost')
        r = cls._search_index(s)
        return {
            e['key_as_string']: e['cost']['value']
            for e in r['aggregations']['months']['buckets']
        }

    # ------------------------------------------------------------------
    # Product specific usage
    # ------------------------------------------------------------------

    @classmethod
    @with_cache()
    def get_lambda_usage(cls, keys, date_from=None, date_to=None):
        """Report usage of ``AWS Lambda`` line items per resource id.

        :returns: a list of dicts with ``rid``, ``name`` (last ``:``
            separated segment of the resource id), ``requests`` and
            ``gb_seconds`` (summed quantity of the first usage type
            ending in ``-Request`` / ``-Lambda-GB-Second``, ``None`` when
            absent), ``cost`` (average line-item cost) and ``raw_cost``
            (computed by ``lambdapricing.get_raw_cost``).
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls._for_accounts(keys)
        s = s.filter('term', product_name='AWS Lambda')
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        agg = s.aggs.bucket('resources',
                            'terms',
                            field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'avg', field='cost')
        agg = agg.bucket('types',
                         'terms',
                         field='usage_type',
                         size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        agg = agg.bucket('descriptions',
                         'terms',
                         field='item_description',
                         size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        res = cls._search_index(s)

        def _lambda_usage_regb(buckets, endswith):
            # Quantity of the first bucket whose key has the given suffix.
            for b in buckets:
                if b['key'].endswith(endswith):
                    return b['quantity']['value']
            return None

        usages = [{
            'rid': usage['key'],
            'name': usage['key'].split(':')[-1],
            'requests': _lambda_usage_regb(usage['types']['buckets'],
                                           '-Request'),
            'gb_seconds': _lambda_usage_regb(usage['types']['buckets'],
                                             '-Lambda-GB-Second'),
            'cost': usage['cost']['value'],
            'raw_cost': lambdapricing.get_raw_cost([
                x['descriptions']['buckets']
                for x in usage['types']['buckets']
            ]),
        } for usage in res['aggregations']['resources']['buckets']]
        return usages

    @classmethod
    @with_cache()
    def get_s3_bandwidth_costs(cls, key, date_from=None, date_to=None):
        """Sum cost and quantity per usage type for S3 line items.

        :arg key: a single linked account id
        :returns: ``[{'type', 'quantity', 'cost'}, ...]``
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        agg = s.aggs.bucket('types',
                            'terms',
                            field='usage_type',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg.metric('gb', 'sum', field='usage_quantity')
        res = cls._search_index(s)

        transfers = [{
            'type': transfer['key'],
            'quantity': transfer['gb']['value'],
            'cost': transfer['cost']['value'],
        } for transfer in res['aggregations']['types']['buckets']]
        return transfers

    @classmethod
    @with_cache()
    def get_ec2_bandwidth_costs(cls, key, date_from=None, date_to=None):
        """Sum cost and quantity per usage type for EC2 line items.

        :arg key: a single linked account id
        :returns: ``[{'type', 'quantity', 'cost'}, ...]``
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Elastic Compute Cloud')
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        agg = s.aggs.bucket('types',
                            'terms',
                            field='usage_type',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg.metric('gb', 'sum', field='usage_quantity')
        res = cls._search_index(s)

        transfers = [{
            'type': transfer['key'],
            'quantity': transfer['gb']['value'],
            'cost': transfer['cost']['value'],
        } for transfer in res['aggregations']['types']['buckets']]
        return transfers

    @classmethod
    def get_ec2_daily_cost(cls, key):
        """Yield ``(day, cost)`` pairs for the EC2 line items of ``key``.

        This is a generator: the query runs on first iteration and no
        date window is applied.
        """
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Elastic Compute Cloud')
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='day',
                            min_doc_count=1)
        agg.metric('cost', 'sum', field='cost')
        res = cls._search_index(s)

        for interval in res['aggregations']['intervals']['buckets']:
            day = interval['key_as_string'].split('T')[0]
            yield day, interval['cost']['value']

    @classmethod
    @with_cache()
    def get_elb_usage_a_day(cls, keys, date_from=None, date_to=None):
        """Report per-day averages for each load balancer resource.

        Resources are those whose id starts with
        ``arn:aws:elasticloadbalancing``.

        :returns: ``[{'rid', 'cost', 'hours', 'bytes'}, ...]`` where each
            figure is the window total divided by the number of days;
            ``bytes`` is the summed ``*Bytes`` quantity times 2**30.
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        gib = Fraction(2**30)
        s = cls._for_accounts(keys)
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        s = s.filter("prefix", resource_id="arn:aws:elasticloadbalancing")
        s = s.sort({"usage_start_date": {"order": "desc"}})
        agg = s.aggs.bucket('rid',
                            'terms',
                            field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg = agg.bucket('types',
                         'terms',
                         field='usage_type',
                         size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        res = cls._search_index(s)

        # NOTE(review): timedelta.days truncates, so the default month
        # window yields one less than the number of calendar days, and a
        # window shorter than 24h yields 0 (ZeroDivisionError as soon as
        # there is one bucket). Confirm the intended divisor.
        days = (date_to - date_from).days

        def total_quantity(elb, suffix):
            return sum([
                x['quantity']['value'] for x in elb['types']['buckets']
                if x['key'].endswith(suffix)
            ])

        elbs = [{
            'rid': elb['key'],
            'cost': elb['cost']['value'] / days,
            'hours': float(total_quantity(elb, 'LoadBalancerUsage') / days),
            'bytes': float((total_quantity(elb, 'Bytes') * gib) / days),
        } for elb in res['aggregations']['rid']['buckets']]
        return elbs

    @classmethod
    @with_cache()
    def get_instance_type(cls, keys, date_from=None, date_to=None):
        """Group EC2 ``BoxUsage`` line items by hour, type and region.

        At most 10000 hits are read, newest first.

        :returns: a list of dicts with ``hour``, ``instance`` (text after
            the first ``:`` of the usage type), ``region`` (availability
            zone without a trailing letter, or ``'unknown'``), ``rids``
            and ``ridCount``
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls.search()
        s = s.extra(_source=[
            'usage_start_date', 'usage_type', 'availability_zone',
            'resource_id'
        ])
        s = s.filter('terms', linked_account_id=cls._as_list(keys))
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        s = s.filter("term", product_name='Amazon Elastic Compute Cloud')
        s = s.query('wildcard', usage_type='*BoxUsage:*')
        s = s.filter('exists', field='resource_id')
        s = s.sort({"usage_start_date": {"order": "desc"}})
        res = cls._search_index(s, size=10000)

        def cut_region_name(zone):
            return zone[:-1] if zone[-1].isalpha() else zone

        types = []
        refs = {}

        def add_in_types(entry, rid):
            # Merge hits sharing the same (hour, instance, region).
            ref_tuple = (entry['hour'], entry['instance'], entry['region'])
            if ref_tuple in refs:
                refs[ref_tuple]['rids'].append(rid)
                refs[ref_tuple]['ridCount'] += 1
                return
            entry['rids'] = [rid]
            types.append(entry)
            refs[ref_tuple] = types[-1]

        for r in res['hits']['hits']:
            source = r['_source']
            elem = {
                'hour': source['usage_start_date'],
                'instance': source['usage_type'].split(':')[1],
                'region': cut_region_name(source['availability_zone'])
                if 'availability_zone' in source else 'unknown',
                'ridCount': 1,
            }
            add_in_types(elem, source['resource_id'])
        return types

    @classmethod
    @with_cache()
    def get_instance_hour(cls,
                          keys,
                          date_from=None,
                          date_to=None,
                          min_hour=None):
        """Average the number of ``BoxUsage`` line items per day.

        Only resource ids starting with ``i-`` are considered, and only
        days having at least one line item enter the average.

        :arg min_hour: when truthy, drop instances whose average is lower
        :returns: ``[{'id', 'hours'}, ...]`` sorted by descending hours
        """
        date_from, date_to = cls._month_bounds(date_from, date_to)
        s = cls._for_accounts(keys)
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        s = s.filter("term", product_name='Amazon Elastic Compute Cloud')
        s = s.filter('prefix', resource_id='i-')
        s = s.query('wildcard', usage_type='*BoxUsage*')
        agg = s.aggs.bucket('resource_id',
                            'terms',
                            field='resource_id',
                            size=0x7FFFFFFF)
        agg.bucket('days',
                   'date_histogram',
                   field='usage_start_date',
                   interval='day',
                   min_doc_count=1)
        res = cls._search_index(s)

        instance_list = []
        for instance in res['aggregations']['resource_id']['buckets']:
            tmp_hours = [day['doc_count']
                         for day in instance['days']['buckets']]
            avg_hours = sum(tmp_hours) / float(len(tmp_hours))
            if not min_hour or avg_hours >= min_hour:
                instance_list.append(dict(id=instance['key'], hours=avg_hours))
        return sorted(instance_list, key=lambda x: x['hours'], reverse=True)

    @classmethod
    @with_cache()
    def get_s3_buckets_per_tag(cls, keys):
        """List S3 resources grouped by tag key, then by tag value.

        Only S3 line items having a ``tag`` and an item description
        matching ``*storage*`` are considered.

        :returns: a list shaped like::

            [{
                "tag_key": "KEY",              # unique in the list
                "tag_value": [{
                    "tag_value": "VALUE",      # unique in the list
                    "s3_buckets": [{
                        "bucket_name": "BUCKET_NAME",
                        "account_id": "ACCOUNT_ID"
                    }, {...}]
                }, {...}]
            }, {...}]
        """
        def _check_if_in_list(dict_list, value, key):
            return next((item for item in dict_list if item[key] == value),
                        None)

        def _parse_buckets_results(buff_tag_value, bucket_tag_value):
            resource_buckets = bucket_tag_value['ressource_id']['buckets']
            for bucket_resource_id in resource_buckets:
                known = _check_if_in_list(buff_tag_value['s3_buckets'],
                                          bucket_resource_id['key'],
                                          'bucket_name')
                if known is None:
                    accounts = bucket_resource_id['account_id']['buckets']
                    buff_tag_value['s3_buckets'].append({
                        "bucket_name": bucket_resource_id['key'],
                        "account_id": accounts[0]['key']
                    })
            return buff_tag_value

        def _parse_tag_values_results(bucket_tag_key, buff_tag_key):
            for bucket_tag_value in bucket_tag_key['tag_value']['buckets']:
                buff_tag_value = _check_if_in_list(buff_tag_key['tag_value'],
                                                   bucket_tag_value['key'],
                                                   'tag_value')
                if buff_tag_value is None:
                    buff_tag_value = {
                        "tag_value": bucket_tag_value['key'],
                        "s3_buckets": []
                    }
                    buff_tag_key['tag_value'].append(buff_tag_value)
                # The entry is already in the list; it is filled in place.
                _parse_buckets_results(buff_tag_value, bucket_tag_value)
            return buff_tag_key

        def _parse_tag_keys_results(res):
            bucket_tagged = []
            for bucket_tag_key in res['aggregations']['tag_key']['buckets']:
                buff_tag_key = _check_if_in_list(bucket_tagged,
                                                 bucket_tag_key['key'],
                                                 'tag_key')
                if buff_tag_key is None:
                    buff_tag_key = {
                        "tag_key": bucket_tag_key['key'],
                        "tag_value": []
                    }
                    bucket_tagged.append(buff_tag_key)
                # The entry is already in the list; it is filled in place.
                _parse_tag_values_results(bucket_tag_key, buff_tag_key)
            return bucket_tagged

        s = cls._for_accounts(keys)
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.query('exists', field="tag")
        s = s.query('wildcard', item_description="*storage*")
        agg = s.aggs.bucket('tag_key', 'terms', field="tag.key")
        agg = agg.bucket('tag_value', 'terms', field='tag.value')
        agg.bucket('ressource_id', 'terms',
                   field='resource_id').bucket('account_id',
                                               'terms',
                                               field='linked_account_id')
        res = cls._search_index(s)

        bucket_tagged = _parse_tag_keys_results(res)
        return bucket_tagged

    @classmethod
    @with_cache()
    def get_s3_bandwidth_info_and_cost_per_name(cls,
                                                key,
                                                bucket_resource_ids,
                                                date_from=None,
                                                date_to=None):
        """Report S3 transfer cost and volume per resource id.

        Only usage types matching ``*Bytes`` are considered. The default
        window is the whole month preceding the current UTC date.

        :arg key: a single linked account id
        :arg bucket_resource_ids: one resource id or a list of them
        :returns: ``[{'bucket_name', 'cost', 'transfer_stats'}, ...]``
            with ``transfer_stats`` a list of ``{'type', 'data'}``
        """
        # ``months=1`` (relative) steps back one month; the singular
        # ``month=1`` would instead force the month to January.
        date_from = date_from or (
            datetime.utcnow() - relativedelta(months=1)).replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.filter('terms', resource_id=cls._as_list(bucket_resource_ids))
        s = s.filter('range',
                     usage_start_date=cls._date_range(date_from, date_to))
        s = s.filter('wildcard', usage_type="*Bytes")
        agg = s.aggs.bucket('bucket_name',
                            'terms',
                            field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg = agg.bucket('transfer_type', 'terms', field='usage_type')
        agg.metric('data', 'sum', field='usage_quantity')
        res = cls._search_index(s)

        data = [{
            "bucket_name": bucket['key'],
            "cost": bucket['cost']['value'],
            "transfer_stats": [{
                "type": transfer_stat['key'],
                "data": transfer_stat['data']['value']
            } for transfer_stat in bucket['transfer_type']['buckets']]
        } for bucket in res['aggregations']['bucket_name']['buckets']]
        return data
class Company(es.DocType):
    """Elasticsearch document describing a company.

    Besides the field mapping, the class body declares the custom
    analysis chain (token filters, tokenizer, analyzers) that the text
    fields refer to. The index name is recomputed through
    ``compute_index_name`` each time an instance is created.
    """

    class Meta:
        index = 'companies'
        doc_type = 'company'

    # --- French text analysis -------------------------------------------

    french_elision = es.token_filter(
        'french_elision',
        type='elision',
        articles_case=True,
        articles=[
            'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
            'jusqu', 'quoiqu', 'lorsqu', 'puisqu'
        ]
    )

    french_stopwords = es.token_filter(
        'french_stopwords',
        type='stop',
        stopwords='_french_'
    )

    # Do not include this filter if keywords is empty
    french_keywords = es.token_filter(
        'french_keywords',
        type='keyword_marker',
        keywords=[]
    )

    french_stemmer = es.token_filter(
        'french_stemmer',
        type='stemmer',
        language='light_french'
    )

    french_analyzer = es.analyzer(
        'french_analyzer',
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            french_elision,
            french_stopwords,
            # french_keywords,
            french_stemmer
        ],
        char_filter=['html_strip']
    )

    # --- Technology names analysis --------------------------------------

    technologies_tokenizer = es.tokenizer(
        'comma_tokenizer',
        type='pattern',
        pattern=' |,|, '
    )

    technologies_synonyms_filter = es.token_filter(
        'technologies_synonyms',
        type='synonym',
        synonyms=[
            'c => c_language',
            'c++, cpp => cpp_language',
            'c/c++, c/cpp => c_language',
            'c/c++, c/cpp => cpp_language',
            'c#, c♯, csharp => csharp_language',
            'f#, f♯, fsharp => fsharp_language',
            'c#, c♯, csharp => dotnet',
            'f#, f♯, fsharp => dotnet',
            '.net => dotnet'
        ]
    )

    technologies_analyzer = es.analyzer(
        'technologies_analyzer',
        tokenizer=technologies_tokenizer,
        filter=[
            'lowercase',
            'asciifolding',
            technologies_synonyms_filter
        ]
    )

    # --- Company name analysis ------------------------------------------

    company_name_analyzer = es.analyzer(
        'company_name_analyzer',
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding'
        ]
    )

    # --- Fields ----------------------------------------------------------

    id = es.String(index='no')
    name = es.String(analyzer=french_analyzer)
    description = es.String(
        analyzer=french_analyzer,
        fields={
            # Sub-field analysing the same text with the technologies chain.
            'technologies': es.String(analyzer=technologies_analyzer)
        }
    )
    technologies = es.String(analyzer=technologies_analyzer)
    url = es.String(index='no')
    logo_url = es.String(index='no')
    address = es.String(analyzer=french_analyzer)
    address_is_valid = es.Boolean()
    email = es.String(index='no')
    phone = es.String(index='no')
    geolocation = es.GeoPoint()
    geolocation_is_valid = es.Boolean()

    def __init__(self, meta=None, **kwargs):
        """Build the document, then swap in the computed index name."""
        super(Company, self).__init__(meta, **kwargs)
        computed_name = compute_index_name(self.index)
        self._doc_type.index = computed_name

    @property
    def index(self):
        """Index name currently stored on the document type options."""
        return self._doc_type.index

    @property
    def doc_type(self):
        """Name of the document type."""
        return self._doc_type.name
class Inschrijving(es.DocType):
    """Elasticsearch mapping stored in the ``HR`` index from settings.

    Most text fields are analysed with one of the project ``analyzers``
    and expose a ``raw`` keyword sub-field; name and address fields also
    expose an ``ngram`` sub-field analysed with ``analyzers.autocomplete``
    at index time and ``standard`` at search time.
    """

    _display = es.Keyword()
    _kvk_display = es.Keyword()

    doctype = es.Keyword()

    kvk_nummer = es.Text(
        analyzer=analyzers.autocomplete,
        fields=dict(
            raw=es.Keyword(),
            nozero=es.Text(analyzer=analyzers.nozero),
        ))

    vestigingsnummer = es.Text(
        analyzer=analyzers.autocomplete,
        fields=dict(
            raw=es.Keyword(),
            nozero=es.Text(analyzer=analyzers.nozero),
            int=es.Integer(),
        ))

    hoofdvestiging = es.Boolean()

    sbi = es.Nested(
        properties=dict(
            code=es.Text(
                analyzer=analyzers.autocomplete,
                fields=dict(raw=es.Keyword())),
            omschrijving=es.Text(),
        ))

    naam = es.Text(
        analyzer=analyzers.adres,
        fields=dict(
            raw=es.Keyword(),
            ngram=es.Text(
                analyzer=analyzers.autocomplete,
                search_analyzer='standard'),
        ))

    handelsnamen = es.Nested(
        properties=dict(
            naam=es.Text(
                analyzer=analyzers.adres,
                fields=dict(
                    raw=es.Keyword(),
                    ngram=es.Text(
                        analyzer=analyzers.autocomplete,
                        search_analyzer='standard'),
                )),
        ))

    postadres = es.Text(
        analyzer=analyzers.adres,
        fields=dict(
            raw=es.Keyword(),
            ngram=es.Text(
                analyzer=analyzers.autocomplete,
                search_analyzer='standard'),
        ))

    bezoekadres = es.Text(
        analyzer=analyzers.adres,
        fields=dict(
            raw=es.Keyword(),
            ngram=es.Text(
                analyzer=analyzers.autocomplete,
                search_analyzer='standard'),
        ))

    bezoekadres_correctie = es.Boolean()

    # main establishment ("hoofdvestiging")
    centroid = es.GeoPoint()

    class Index:
        name = settings.ELASTIC_INDICES['HR']
class ValidationJob(elasticsearch_dsl.Document):
    """
    Elasticsearch document holding validation data for a job.

    Fields fall into three groups: job bookkeeping, metrics, and the
    parameters the job ran with. The document is stored in the index
    named by ``VALIDATION_JOBS``.

    TODO: Can this be merged with TrainingJob, with a common parent
    object?
    """

    # --- Job bookkeeping ---
    id = elasticsearch_dsl.Integer()
    schema_version = elasticsearch_dsl.Integer()
    job_name = elasticsearch_dsl.Keyword()
    author = elasticsearch_dsl.Keyword()
    created_at = elasticsearch_dsl.Date()
    params = elasticsearch_dsl.Text()
    raw_log = elasticsearch_dsl.Text()

    # --- Metrics ---
    purported_acc = elasticsearch_dsl.Float()
    purported_loss = elasticsearch_dsl.Float()
    purported_sensitivity = elasticsearch_dsl.Float()

    avg_test_acc = elasticsearch_dsl.Float()
    avg_test_loss = elasticsearch_dsl.Float()
    avg_test_sensitivity = elasticsearch_dsl.Float()
    avg_test_specificity = elasticsearch_dsl.Float()
    avg_test_true_pos = elasticsearch_dsl.Float()
    avg_test_false_neg = elasticsearch_dsl.Float()
    avg_test_auc = elasticsearch_dsl.Float()

    best_test_acc = elasticsearch_dsl.Float()
    best_test_loss = elasticsearch_dsl.Float()
    best_test_sensitivity = elasticsearch_dsl.Float()
    best_test_specificity = elasticsearch_dsl.Float()
    best_test_true_pos = elasticsearch_dsl.Float()
    best_test_false_neg = elasticsearch_dsl.Float()
    best_test_auc = elasticsearch_dsl.Float()

    best_end_val_acc = elasticsearch_dsl.Float()
    best_end_val_loss = elasticsearch_dsl.Float()
    best_max_val_acc = elasticsearch_dsl.Float()
    best_max_val_loss = elasticsearch_dsl.Float()

    # --- Params ---
    batch_size = elasticsearch_dsl.Integer()
    val_split = elasticsearch_dsl.Float()
    seed = elasticsearch_dsl.Integer()
    rotation_range = elasticsearch_dsl.Float()
    width_shift_range = elasticsearch_dsl.Float()
    height_shift_range = elasticsearch_dsl.Float()
    shear_range = elasticsearch_dsl.Float()
    zoom_range = elasticsearch_dsl.Keyword()
    horizontal_flip = elasticsearch_dsl.Boolean()
    vertical_flip = elasticsearch_dsl.Boolean()
    dropout_rate1 = elasticsearch_dsl.Float()
    dropout_rate2 = elasticsearch_dsl.Float()
    data_dir = elasticsearch_dsl.Keyword()
    gcs_url = elasticsearch_dsl.Keyword()
    mip_thickness = elasticsearch_dsl.Integer()
    height_offset = elasticsearch_dsl.Integer()
    pixel_value_range = elasticsearch_dsl.Keyword()

    # The parser needs an explicit list of parameter names, since the
    # usual ways of enumerating class attributes do not work here.
    # Keep it in sync with the "Params" fields above.
    params_to_parse = [
        'batch_size',
        'val_split',
        'seed',
        'rotation_range',
        'width_shift_range',
        'height_shift_range',
        'shear_range',
        'zoom_range',
        'horizontal_flip',
        'vertical_flip',
        'dropout_rate1',
        'dropout_rate2',
        'data_dir',
        'gcs_url',
        'mip_thickness',
        'height_offset',
        'pixel_value_range',
    ]

    class Index:
        name = VALIDATION_JOBS