class PlayersInGame(elasticsearch_dsl.InnerDoc):
    """Search sub-document describing who sits at each compass seat of a game."""

    # One embedded Player object per seat.
    north = elasticsearch_dsl.Object(Player)
    east = elasticsearch_dsl.Object(Player)
    south = elasticsearch_dsl.Object(Player)
    west = elasticsearch_dsl.Object(Player)
class ClusterSource(es.Document):
    """Persisted clustering result for a named source."""

    name = es.Keyword()
    clusters = es.Object()
    clustering_params = es.Object()

    class Index:
        # Index location and explicit mapping for the fields above.
        name = ES_INDEX_SOURCE_CLUSTERS
        using = ES_CLIENT
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 1,
        }
        mappings = {
            "properties": {
                "name": {"type": "keyword"},
                "clusters": {"type": "object"},
                "clustering_params": {"type": "object"},
            },
        }
def gendocu(stream, estype='document'):
    """
    Create the Elasticsearch mapping for the document type and write it out.

    :param stream: File-like object the mapping dict is pretty-printed to.
    :param estype: Name of ES type (defaults to 'document')
    """
    mapping = dsl.Mapping(estype)
    # Reject documents carrying fields that are not declared below.
    mapping.properties.dynamic = 'strict'
    # Shared JSON-LD context first, then the type's own fields.
    mapping = gencontext(mapping)
    mapping = mapping.field('@id', 'string', index='not_analyzed')
    mapping = mapping.field('@type', 'string', index='no')
    mapping = mapping.field('dc:contributor', 'string', index='analyzed',
                            analyzer='autocomplete')
    # dct:issued and dct:modified share the same @type/@value object layout.
    timestamp = dsl.Object()
    timestamp = timestamp.property('@type', 'string')
    timestamp = timestamp.property('@value', 'date', format='dateOptionalTime')
    mapping = mapping.field('dct:issued', timestamp)
    mapping = mapping.field('dct:modified', timestamp)
    mapping = mapping.field(
        'foaf:primaryTopic',
        dsl.Object().property('@id', 'string', index='not_analyzed'))
    # Emit the finished mapping.
    pprint(mapping.to_dict(), stream=stream)
class TopicCombo(es.Document):
    """Pre-computed set of documents shared by a combination of topics."""

    topics = es.Object()
    common_docs_ids = es.Keyword()
    common_docs_num = es.Integer()

    class Index:
        name = ES_INDEX_TOPIC_COMBOS  # f"{ES_INDEX_TOPIC_COMBOS}_{tm}"
        using = ES_CLIENT
        settings = {
            "number_of_shards": 2,
            "number_of_replicas": 1,
            "max_result_window": 5000000,
        }
        mappings = {
            "properties": {
                "topics": {"type": "object"},
                "common_docs_ids": {"type": "keyword"},
                "common_docs_num": {"type": "integer"},
            }
        }
def test_aggregate_data_schema():
    """Verify the behaviour of the ingress.utils.aggregate_data_schema function."""
    # Small class tree whose data_schema attributes should all be merged.
    class Base:  # noqa
        data_schema = {}

    class Sub1(Base):  # noqa
        data_schema = {'Sub1': True}

    class Sub2(Base):  # noqa
        data_schema = {'Sub2': True}

    class SubSub1(Sub2):  # noqa
        data_schema = {'SubSub1': True}

    class SubSub2(Sub2):  # noqa
        data_schema = {'SubSub2': True}

    expected = {
        'Sub1': True,
        'Sub2': True,
        'SubSub1': True,
        'SubSub2': True,
        # Default fields added when include_defaults=True.
        '_raw': es.Object(dynamic=True),
        'timestamp': es.Date(),
    }
    assert iu.aggregate_data_schema(Base, include_defaults=True) == expected
class Activiteit(es.DocType):
    """Search document for an activity, with an embedded location record."""

    ext_id = es.String(index='not_analyzed')
    naam = es.String(analyzer=dutch_analyzer)
    beschrijving = es.String(analyzer=dutch_analyzer)
    bron_link = es.String(index='not_analyzed')
    tijdstip = es.String(index='not_analyzed')
    tags = es.String(index='not_analyzed')
    centroid = es.GeoPoint()
    # Embedded location; identifier-like fields stay unanalysed for exact match.
    locatie = es.Object(
        doc_class=Locatie,
        properties={
            'ext_id': es.String(index='not_analyzed'),
            'naam': es.String(analyzer=dutch_analyzer),
            'centroid': es.GeoPoint(),
            'openbare_ruimte_naam': es.String(index='not_analyzed'),
            'huisnummer': es.String(index='not_analyzed'),
            'huisnummer_toevoeging': es.String(index='not_analyzed'),
            'postcode': es.String(index='not_analyzed'),
        })
class Node(es.DocType):
    """Elastic document describing user"""

    node_type = es.Keyword()
    objectID = es.Keyword()
    # fielddata=True enables sorting/aggregation on this analysed text field.
    name = es.Text(fielddata=True, analyzer=autocomplete)
    user = es.Object(fields={
        'id': es.Keyword(),
        'name': es.Text(fielddata=True, analyzer=autocomplete),
    })
    description = es.Text()
    is_free = es.Boolean()
    project = es.Object(fields={
        'id': es.Keyword(),
        'name': es.Keyword(),
        'url': es.Keyword(),
    })
    media = es.Keyword()
    picture = es.Keyword()
    tags = es.Keyword(multi=True)
    license_notes = es.Text()
    created_at = es.Date()
    updated_at = es.Date()

    class Meta:
        index = 'nodes'
class TestSearchDocument(BaseDocument):  # pylint: disable=no-member
    """Minimal document used by the search tests."""

    name = dsl.String()
    num = dsl.Integer()
    json = dsl.Object()

    class Meta:
        index = 'test_search'
def gencontext(mapobj):
    """Attach the JSON-LD '@context' object to *mapobj* and return it.

    The context carries one unindexed string property per known namespace
    prefix.
    """
    context = dsl.Object()
    for prefix in ('bibo', 'dbp', 'dc', 'dct', 'foaf',
                   'rdau', 'rdf', 'rdfs', 'skos', 'xsd'):
        context = context.property(prefix, 'string', index='no')
    return mapobj.field('@context', context)
def deep_field_factory(field):
    """Build a search field for *field*, recursing into to-one relations."""
    if not (field.is_relation and (field.many_to_one or field.one_to_one)):
        # Plain (non to-one) field: delegate to the default factory.
        return document_field(field)
    # To-one relation: map the related model's fields as an object.
    properties = {}
    for related in field.related_model._meta.get_fields():
        mapped = deep_field_factory(related)
        if mapped is not None:
            properties[related.name] = mapped
    return dsl.Object(properties=properties)
class GameSummary(elasticsearch_dsl.Document):
    """Game search model"""

    id = elasticsearch_dsl.Text()
    name = elasticsearch_dsl.Text()
    isPublic = elasticsearch_dsl.Boolean()
    # Embedded seat -> player mapping.
    players = elasticsearch_dsl.Object(PlayersInGame)

    class Index:  # pylint: disable=missing-class-docstring
        name = "games"
class TestSearchDocument(BaseDocument):
    """Document with one field of every interesting type, for search tests."""

    id = dsl.Integer()
    name = dsl.Text(fielddata=True)
    num = dsl.Integer()
    date = dsl.Date()
    json = dsl.Object()
    field_name = Name()
    field_process_type = ProcessType()
    none_test = dsl.Integer()

    class Index:
        name = "test_search"
class TestSearchDocument(BaseDocument):  # pylint: disable=no-member
    """Document with one field of every interesting type, for search tests."""

    id = dsl.Integer()  # pylint: disable=invalid-name
    name = dsl.String()
    num = dsl.Integer()
    json = dsl.Object()
    field_name = Name()
    field_process_type = ProcessType()
    none_test = dsl.Integer()

    class Meta:
        index = 'test_search'
class GeoCoding(PluginBase):
    """Class that will attempt to geotag a tweet."""

    data_schema = {
        'geotagged': es.Boolean(),
        'location': es.Object(Location),
        'coordinates': es.GeoPoint(),
    }

    def __init__(self, *args, **kwargs) -> None:
        """Setup Carmen geotagging options, then init super."""
        with warnings.catch_warnings():
            # carmen's default setup appears to raise several warnings;
            # suppress them for the duration of resolver construction.
            warnings.simplefilter("ignore")
            self.geotagger = get_resolver(
                options={'place': {'allow_unknown_locations': True}})
            self.geotagger.load_locations()
        self.location_resolver = LocationEncoder()
        super().__init__(*args, **kwargs)  # type: ignore

    def process_tweet(self, tweet_json: Dict[str, Any]) -> Dict[str, Any]:
        """
        Attempt to geotag the tweet data.

        Returns the tweet with new data if any was resolved and will set
        geotagged according to success or failure.
        """
        LOG.debug('Attempting to geotag tweet')
        resolved = self.geotagger.resolve_tweet(tweet_json['_raw'])
        tweet_json['geotagged'] = False
        if resolved:
            LOG.debug(' This tweet includes location information')
            location = self.location_resolver.default(resolved[1])
            tweet_json['location'] = location
            # Only promote to coordinates when both components are present.
            if 'latitude' in location and 'longitude' in location:
                tweet_json['coordinates'] = {
                    'lat': location['latitude'],
                    'lon': location['longitude'],
                }
                tweet_json['geotagged'] = True
        LOG.debug('Geotagging completed!')
        return tweet_json
class EmbeddingIndex(es.Document):
    """Metadata describing one embedding build over a corpus."""

    corpus = es.Keyword()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()
    by_unit = es.Keyword()  # Token/Word/Sentence/Text
    algorithm = es.Keyword()
    pooling = es.Keyword()
    meta_parameters = es.Object()

    class Index:
        name = ES_INDEX_EMBEDDING
        using = ES_CLIENT
def aggregate_data_schema(
    base_class: Type,
    include_defaults: bool = True,
) -> Dict[str, Any]:
    """Iterate through imported plugins and create an ingress mapping to
    process the data with.

    :param base_class: Root class whose subclasses are scanned for a
        ``data_schema`` attribute.
    :param include_defaults: When True, add the default ``_raw`` and
        ``timestamp`` fields on top of the plugin-provided ones.
    :returns: Mapping of field name to elasticsearch-dsl field definition.
    """
    mapping: Dict = {}
    for subclass in find_subclasses(base_class):
        # getattr with a default replaces the try/except-AttributeError
        # dance: a missing or empty schema is simply skipped either way.
        subclass_data_schema = getattr(subclass, 'data_schema', None)
        if subclass_data_schema:
            mapping.update(subclass_data_schema)
    if include_defaults:
        mapping['_raw'] = es.Object(dynamic=True)
        mapping['timestamp'] = es.Date()
    return mapping
class TopicModellingIndex(es.Document):
    """Metadata and quality metrics for one topic-modelling run."""

    # Provenance and status.
    corpus = es.Keyword()
    source = es.Keyword()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()
    has_topic_info = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()
    datetime_from = es.Date()
    datetime_to = es.Date()
    # Model configuration.
    algorithm = es.Keyword()
    number_of_topics = es.Integer()
    hierarchical = es.Boolean()
    meta_parameters = es.Object()
    # Quality metrics.
    perplexity = es.Float()
    purity = es.Float()
    contrast = es.Float()
    coherence = es.Float()
    # Regularizer coefficients.
    tau_smooth_sparse_theta = es.Float()
    tau_smooth_sparse_phi = es.Float()
    tau_decorrelator_phi = es.Float()
    tau_coherence_phi = es.Float()
    topics = es.Nested(Topic)
    is_actualizable = es.Boolean()

    class Index:
        name = ES_INDEX_TOPIC_MODELLING
        using = ES_CLIENT
class Customer(ArchivingDocType):
    """Model a customer."""

    name = dsl.Keyword()
    # permissions: mapping of index name -> list of permission names.
    permissions = dsl.Object()
    cycles = dsl.Object()

    class Meta:
        index = auth_index._name

    @classmethod
    def get_by_name(cls, name):
        # type: (str, dsl.Index) -> reles.auth.models.Customer
        """Get the first customer with the given name.

        Raises NotFoundError when no customer matches and ConflictError
        when more than one does (names are expected to be unique).
        """
        response = cls.search(index=auth_index._name).filter(
            'term', name=name
        ).execute()
        if response.hits.total == 0:
            raise NotFoundError(
                'There is no customer with name \'{}\''.format(name)
            )
        elif response.hits.total == 1:
            return response[0]
        else:
            raise ConflictError(
                'Inconsistent data detected: there are {} customers with name'
                ' \'{}\': {}'.format(
                    response.hits.total,
                    name,
                    [user.meta.id for user in response.hits],
                )
            )

    @classmethod
    def charge_cycles(cls, customer_id, target, cycles):
        """Charge *cycles* against *customer_id* for index *target*.

        Runs the stored 'charge_cycles' groovy script server-side so the
        update happens inside Elasticsearch in a single call.
        """
        es = dsl.connections.connections.get_connection(cls._doc_type.using)
        try:
            return es.update(
                index=cls._doc_type.index,
                doc_type=cls._doc_type.name,
                id=customer_id,
                body={
                    'script': {
                        'file': 'charge_cycles',
                        'lang': 'groovy',
                        'params': {
                            'index': target,
                            'cycles': cycles,
                        }
                    }
                }
            )
        except elasticsearch.NotFoundError:
            # Translate the low-level ES error into the app-level one.
            app.logger.debug(
                'Failed to charge non-existent customer `%s` with %d cycles for'
                ' index `%s`', customer_id, cycles, target
            )
            raise NotFoundError('Invalid customer')

    def add_permissions(self, permissions):
        """Add the given permissions."""
        # Union per index; stored as a list because ES has no set type.
        for index, added in permissions.items():
            self.permissions[index] = list(
                set(self.permissions.to_dict().get(index, [])).union(added)
            )

    def remove_permissions(self, permissions):
        """Remove the given permissions."""
        for index, removed in permissions.items():
            self.permissions[index] = list(
                set(self.permissions.to_dict().get(index, [])).difference(removed)
            )

    def save(self, using=None, index=None, validate=True, **kwargs):
        """Save a customer instance."""
        # Keep ES index aliases in sync with self.permissions before persisting.
        self._update_aliases()
        super(Customer, self).save(using, index, validate, **kwargs)

    def _update_aliases(self):
        """Reconcile ES index aliases with the current permission set."""
        alias_actions = []
        # delete aliases of removed permissions
        # this encodes knowledge about how aliases are formatted
        affected_indexes = app.es.indices.get_alias('*_%s_*' % self.name)
        for index in affected_indexes:
            permissions_by_index = self.permissions.to_dict().get(index, [])
            for alias in affected_indexes[index]['aliases']:
                _, customer, permission = unalias(alias)
                if customer == self.name and permission not in permissions_by_index:
                    alias_actions.append(
                        self._build_alias_action('remove', index, permission)
                    )
        # create aliases for added permissions
        for index in self.permissions:
            for permission in self.permissions[index]:
                if not app.es.indices.exists_alias(name=get_alias(index, self.name, permission)):
                    alias_actions.append(
                        self._build_alias_action('add', index, permission)
                    )
        if alias_actions:
            app.es.indices.update_aliases(body={'actions': alias_actions})

    def _build_alias_action(self, action, index, permission):
        # One entry of the `actions` list for indices.update_aliases.
        return {
            action: {
                'index': index,
                'alias': get_alias(index, self.name, permission)
            }
        }

    def refresh(self):
        """Sync the instance with the ES."""
        self.__dict__.update(self.get(self.meta.id).__dict__)
def genbibres(stream, estype='bibliographicResource'):
    """
    Creates the mapping for type bibliographicResource in Elasticsearch

    :param stream: file-like object the mapping dict is pretty-printed to
    :param estype: Name of ES type (defaults to 'bibliographicResource')
    """
    m = dsl.Mapping(estype)
    # Set properties: reject fields not declared in this mapping.
    m.properties.dynamic = 'strict'
    # Adding mapping: shared JSON-LD context first, then the type's fields.
    m = gencontext(m)
    m = m.field('@id', 'string', index='not_analyzed')
    m = m.field('@type', 'string', index='no')
    m = m.field('bibo:edition', 'string', index='analyzed')
    # Identifiers are kept unanalysed for exact matching.
    m = m.field('bibo:isbn10', 'string', index='not_analyzed')
    m = m.field('bibo:isbn13', 'string', index='not_analyzed')
    m = m.field('bibo:issn', 'string', index='not_analyzed')
    m = m.field('dbp:originalLanguage',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    # Contributors are nested documents with per-property index settings.
    contrib = dsl.Nested()
    contrib = contrib.property('@id', dsl.String(index='no'))
    contrib = contrib.property('@type', dsl.String(index='no'))
    contrib = contrib.property('dbp:birthYear', dsl.String(index='not_analyzed'))
    contrib = contrib.property('dbp:deathYear', dsl.String(index='not_analyzed'))
    contrib = contrib.property('foaf:firstName', dsl.String(index='analyzed'))
    contrib = contrib.property('foaf:lastName', dsl.String(index='analyzed'))
    contrib = contrib.property('foaf:name', dsl.String(index='analyzed'))
    contrib = contrib.property('rdfs:label', dsl.String(index='analyzed'))
    contrib = contrib.property('skos:note', dsl.String(index='analyzed'))
    m = m.field('dc:contributor', contrib)
    m = m.field('dc:format', 'string', index='analyzed')
    # Titles get an extra accent-folded sub-field for diacritic-insensitive search.
    m = m.field('dct:alternative', 'string', index='analyzed',
                fields={'folded': dsl.String(analyzer='text_folded')})
    m = m.field('dct:bibliographicCitation', 'string', index='analyzed',
                analyzer='standard')
    m = m.field('dct:hasPart', 'string', index='analyzed')
    m = m.field('dct:isPartOf',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('dct:issued', 'string', index='analyzed')
    m = m.field('dct:language',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('dct:subject',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('dct:title', 'string', index='analyzed',
                fields={'folded': dsl.String(analyzer='text_folded')})
    m = m.field('rdau:contentType',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('rdau:dissertationOrThesisInformation', 'string',
                index='analyzed')
    m = m.field('rdau:mediaType',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('rdau:modeOfIssuance',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('rdau:noteOnResource', 'string', index='not_analyzed')
    m = m.field('rdau:placeOfPublication',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('rdau:publicationStatement', 'string', index='analyzed')
    m = m.field(
        'rdfs:isDefinedBy',
        dsl.Object().property('@id', 'string', index='analyzed',
                              analyzer='extr_id'))
    # Save the mapping in ES
    pprint(m.to_dict(), stream=stream)
if options.dry_run: for hit in document_generator: logging.info(hit) success = True else: success, _ = bulk(es, gen, max_retries=20, initial_backoff=2, max_backoff=3600) return success machine_ad = edsl.Mapping.from_es( doc_type="machine_ad", index=options.indexname, using=es ) if not "claims" in machine_ad or not "failed" in machine_ad.to_dict()['machine_ad']['properties']['claims']['properties']: machine_ad.field( "jobs", edsl.Object(properties={status: edsl.Text(multi=True) for status in STATUSES}), ) machine_ad.field( "claims", edsl.Object( properties={ status: edsl.Object( properties={resource: edsl.Float() for resource in RESOURCES} ) for status in STATUSES } ), ) machine_ad.field( "occupancy", edsl.Object(
class AWSDetailedLineitem(dsl.DocType): class Meta: index = 'awsdetailedlineitem' availability_zone = dsl.String(index='not_analyzed') cost = dsl.Double() un_blended_cost = dsl.Double() item_description = dsl.String(index='not_analyzed') linked_account_id = dsl.String(index='not_analyzed') operation = dsl.String() payer_account_id = dsl.String(index='not_analyzed') pricing_plan_id = dsl.Long() product_name = dsl.String(index='not_analyzed') rate = dsl.Double() un_blended_rate = dsl.Double() rate_id = dsl.Long() record_id = dsl.String(index='not_analyzed') reserved_instance = dsl.Boolean() resource_id = dsl.String(index='not_analyzed') subscription_id = dsl.Long() tag = dsl.Object( properties={ 'key': dsl.String(index='not_analyzed'), 'value': dsl.String(index='not_analyzed') }) usage_end_date = dsl.Date(format='strict_date_optional_time||epoch_millis') usage_quantity = dsl.Double() usage_start_date = dsl.Date( format='strict_date_optional_time||epoch_millis') usage_type = dsl.String(index='not_analyzed') @classmethod @with_cache(ttl=3600 * 3, worker_refresh=True) def keys_has_data(cls, keys, date_from=None, date_to=None): date_to = date_to or datetime.utcnow() s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) if date_from: s = s.filter('range', usage_start_date={ 'from': date_from.isoformat(), 'to': date_to.isoformat() }) res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=0, request_timeout=60) return res['hits']['total'] > 0 @classmethod @with_cache(is_json=False, ret=lambda x: datetime.strptime(x, "%Y-%m-%d")) def get_first_date(cls, keys): s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) s = s.sort('usage_start_date') res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=1, request_timeout=60) if res['hits']['total'] == 0: return return res['hits']['hits'][0]['_source']['usage_start_date'].split( 'T')[0] @classmethod 
@with_cache(is_json=False, ret=lambda x: datetime.strptime(x, "%Y-%m-%d")) def get_last_date(cls, keys, limit=None): s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) if limit: s = s.filter('range', usage_start_date={'to': limit.isoformat()}) s = s.sort('-usage_start_date') res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=1, request_timeout=60) if res['hits']['total'] == 0: return return res['hits']['hits'][0]['_source']['usage_start_date'].split( 'T')[0] @classmethod def get_first_to_now_date(cls, keys): def from_date_to_today(d): now = datetime.utcnow() while d < now: yield d d += relativedelta(months=1) return list(from_date_to_today(cls.get_first_date(keys))) @classmethod def get_first_to_last_date(cls, keys): def from_date_to_last(d): last = cls.get_last_date(keys) while d < last: yield d d += relativedelta(months=1) return list(from_date_to_last(cls.get_first_date(keys))) @classmethod @with_cache(6 * 3600) def get_available_tags(cls, keys, only_with_data=None, product_name=None): s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) if product_name: s = s.filter('term', product_name=product_name) s.aggs.bucket('tag_key', 'terms', field='tag.key') res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=0, request_timeout=60) tags = [] for tag in res['aggregations']['tag_key']['buckets']: if tag['key'].startswith('user:'******'key'].split(':')[1] if not only_with_data or name in AWSStat.latest_hourly_cpu_usage_by_tag( only_with_data )['tags'] or name in AWSStat.latest_daily_cpu_usage_by_tag( only_with_data)['tags']: tags.append(name) tags.sort() return dict(tags=tags) @classmethod @with_cache(ttl=6 * 3600) def get_cost_by_tag(cls, keys, tag, date_from=None, date_to=None): date_from = date_from or datetime.utcnow().replace( day=1, hour=0, minute=0, second=0, microsecond=0) date_to = date_to or 
date_from.replace(day=calendar.monthrange( date_from.year, date_from.month)[1], hour=23, minute=59, second=59, microsecond=999999) s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) s = s.filter('term', **{'tag.key': 'user:{}'.format(tag)}) s = s.filter('range', usage_start_date={ 'from': date_from.isoformat(), 'to': date_to.isoformat() }) s.aggs.bucket('total_cost', 'sum', field='cost') agg = s.aggs.bucket('tag_value', 'terms', field='tag.value', size=0x7FFFFFFF) agg.bucket('cost', 'sum', field='cost') res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=0, request_timeout=60) tags = [{ 'tag_value': tag['key'], 'cost': tag['cost']['value'], } for tag in res['aggregations']['tag_value']['buckets']] return dict(tags=tags, total_cost=res['aggregations']['total_cost']['value']) @classmethod @with_cache(ttl=6 * 3600) def get_cost(cls, keys, date_from, date_to=None): date_from = date_from or datetime.utcnow().replace( day=1, hour=0, minute=0, second=0, microsecond=0) date_to = date_to or date_from.replace( hour=23, minute=59, second=59, microsecond=999999) s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) s = s.filter('range', usage_start_date={ 'from': date_from.isoformat(), 'to': date_to.isoformat() }) s.aggs.bucket('total_cost', 'sum', field='cost') res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=0, request_timeout=60) return dict(total_cost=res['aggregations']['total_cost']['value']) @classmethod @with_cache() def get_monthly_cost_by_tag(cls, keys, tag, date_from=None, date_to=None): date_from = date_from or datetime.utcnow().replace( day=1, hour=0, minute=0, second=0, microsecond=0) date_to = date_to or date_from.replace(day=calendar.monthrange( date_from.year, date_from.month)[1], hour=23, minute=59, second=59, microsecond=999999) s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) 
else [keys]) s = s.filter('term', **{'tag.key': 'user:{}'.format(tag)}) s = s.filter('range', usage_start_date={ 'from': date_from.isoformat(), 'to': date_to.isoformat() }) agg = s.aggs.bucket('intervals', 'date_histogram', field='usage_start_date', interval='month', min_doc_count=1) agg.bucket('total_cost', 'sum', field='cost') agg = agg.bucket('tag_value', 'terms', field='tag.value', size=0x7FFFFFFF) agg.bucket('cost', 'sum', field='cost') res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=0, request_timeout=60) months = [{ 'month': interval['key_as_string'].split('T')[0][:-3], 'tags': [{ 'tag_value': tag['key'], 'cost': tag['cost']['value'], } for tag in interval['tag_value']['buckets']], 'total_cost': interval['total_cost']['value'], } for interval in res['aggregations']['intervals']['buckets']] return dict(months=months) @classmethod @with_cache() def get_cost_by_product(cls, key, date_from=None, date_to=None, without_discount=False, only_discount=False, size=0x7FFFFFFF): date_from = date_from or datetime.utcnow().replace( day=1, hour=0, minute=0, second=0, microsecond=0) date_to = date_to or date_from.replace(day=calendar.monthrange( date_from.year, date_from.month)[1], hour=23, minute=59, second=59, microsecond=999999) s = cls.search() s = s.filter('term', linked_account_id=key) s = s.filter('range', usage_start_date={ 'from': date_from.isoformat(), 'to': date_to.isoformat() }) if without_discount: s = s.query( 'bool', filter=[ ~dsl.Q('term', item_description='PAR_APN_ProgramFee_2500') ]) if only_discount: s = s.filter('term', item_description='PAR_APN_ProgramFee_2500') agg = s.aggs.bucket('products', 'terms', field='product_name', order={'cost': 'desc'}, size=size) agg.bucket('cost', 'sum', field='cost') s = s.query('bool', filter=[~dsl.Q('term', cost=0)]) res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=0, request_timeout=60) products = [{ 'product': SHORT_NAMES.get(product['key'], product['key']), 'cost': 
product['cost']['value'], } for product in res['aggregations']['products']['buckets']] return dict(products=products) @classmethod @with_cache() def get_cost_by_region(cls, keys, tagged=False, byaccount=False, date_from=None, date_to=None, size=0x7FFFFFFF): date_from = date_from or datetime.utcnow().replace( day=1, hour=0, minute=0, second=0, microsecond=0) date_to = date_to or date_from.replace(day=calendar.monthrange( date_from.year, date_from.month)[1], hour=23, minute=59, second=59, microsecond=999999) s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) s = s.filter('range', usage_start_date={ 'from': date_from.isoformat(), 'to': date_to.isoformat() }) agg = s.aggs if byaccount: agg = agg.bucket('accounts', 'terms', field='linked_account_id') agg = agg.bucket('intervals', 'date_histogram', field='usage_start_date', interval='month', min_doc_count=1) agg = agg.bucket('regions', 'terms', field='availability_zone', size=size) agg.bucket('cost', 'sum', field='cost') if tagged: agg = agg.bucket('tags', 'terms', field='tag.value') agg.bucket('cost', 'sum', field='cost') res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=0) return res['aggregations'] @classmethod @with_cache() def get_monthly_cost(cls, keys, date_from=None, date_to=None, size=0x7FFFFFFF): date_from = date_from or datetime.utcnow().replace( day=1, hour=0, minute=0, second=0, microsecond=0) date_to = date_to or date_from.replace(day=calendar.monthrange( date_from.year, date_from.month)[1], hour=23, minute=59, second=59, microsecond=999999) s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) s = s.filter('range', usage_start_date={ 'from': date_from.isoformat(), 'to': date_to.isoformat() }) agg = s.aggs.bucket('intervals', 'date_histogram', field='usage_start_date', interval='month', min_doc_count=1) agg.bucket('cost', 'sum', field='cost') res = 
client.search(index='awsdetailedlineitem', body=s.to_dict(), size=0, request_timeout=60) res = [{ 'month': interval['key_as_string'].split('T')[0], 'total_cost': interval['cost']['value'], } for interval in res['aggregations']['intervals']['buckets']] return dict(months=res) @classmethod @with_cache() def get_monthly_cost_by_product(cls, keys, tagged=False, date_from=None, date_to=None, size=0x7FFFFFFF): date_from = date_from or datetime.utcnow().replace( day=1, hour=0, minute=0, second=0, microsecond=0) date_to = date_to or date_from.replace(day=calendar.monthrange( date_from.year, date_from.month)[1], hour=23, minute=59, second=59, microsecond=999999) s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) s = s.filter('range', usage_start_date={ 'from': date_from.isoformat(), 'to': date_to.isoformat() }) agg = s.aggs.bucket('intervals', 'date_histogram', field='usage_start_date', interval='month', min_doc_count=1) agg = agg.bucket('products', 'terms', field='product_name', size=size) agg.bucket('cost', 'sum', field='cost') if tagged: agg = agg.bucket('tags', 'terms', field='tag.value') agg.bucket('cost', 'sum', field='cost') s = s.query('bool', filter=[~dsl.Q('term', cost=0)]) res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=0, request_timeout=60) def tagged_cost(bucket, total): total_tag = 0.0 for tag in bucket: total_tag += tag['cost']['value'] yield (tag['key'], tag['cost']['value']) if total != total_tag: yield ('untagged', total - total_tag) res = [{ 'month': interval['key_as_string'].split('T')[0], 'products': [{ 'product': SHORT_NAMES.get(product['key'], product['key']), 'cost': product['cost']['value'], 'tags': [{ 'name': tag[0], 'cost': tag[1], } for tag in tagged_cost(product['tags']['buckets'], product['cost']['value'])], } for product in interval['products']['buckets']] if tagged else [{ 'product': SHORT_NAMES.get(product['key'], product['key']), 'cost': product['cost']['value'], } 
for product in interval['products']['buckets']] } for interval in res['aggregations']['intervals']['buckets']] return dict(months=res) @classmethod @with_cache(ttl=4 * 3600) def get_daily_cost_by_product(cls, keys, date_from=None, date_to=None, size=0x7FFFFFFF): date_from = date_from or datetime.utcnow().replace( hour=0, minute=0, second=0, microsecond=0) date_to = date_to or date_from.replace( hour=23, minute=59, second=59, microsecond=999999) s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) s = s.filter('range', usage_start_date={ 'from': date_from.isoformat(), 'to': date_to.isoformat() }) agg = s.aggs.bucket('intervals', 'date_histogram', field='usage_start_date', interval='day', min_doc_count=1) agg = agg.bucket('products', 'terms', field='product_name', size=size) agg.metric('cost', 'sum', field='cost') s = s.query('bool', filter=[~dsl.Q('term', cost=0)]) res = client.search(index='awsdetailedlineitem', body=s.to_dict(), size=0, request_timeout=60) res = [{ 'day': interval['key_as_string'].split('T')[0], 'products': [{ 'product': SHORT_NAMES.get(product['key'], product['key']), 'cost': product['cost']['value'], } for product in interval['products']['buckets']] } for interval in res['aggregations']['intervals']['buckets']] return dict(days=res) @classmethod @with_cache(ttl=24 * 3600) def get_yearly_cost_by_product(cls, keys, date_from=None, date_to=None, size=0x7FFFFFFF): date_from = date_from or datetime.utcnow().replace( month=1, day=1, hour=0, minute=0, second=0, microsecond=0) date_to = date_to or date_from.replace(month=12, day=31, hour=23, minute=59, second=59, microsecond=999999) s = cls.search() s = s.filter( 'terms', linked_account_id=keys if isinstance(keys, list) else [keys]) s = s.filter('range', usage_start_date={ 'from': date_from.isoformat(), 'to': date_to.isoformat() }) agg = s.aggs.bucket('intervals', 'date_histogram', field='usage_start_date', interval='year', min_doc_count=1) agg = 
        # Tail of a method whose signature appears earlier in the file:
        # groups cost per product per (time) interval, excluding zero-cost
        # rows, and reshapes the aggregation response into a per-year list.
        agg.bucket('products', 'terms', field='product_name', size=size)
        agg.metric('cost', 'sum', field='cost')
        # Exclude free line items so the buckets only contain billed usage.
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        res = [{
            # 'key_as_string' is an ISO date; the first four chars are the year.
            'year': interval['key_as_string'][:4],
            'products': [{
                'product': SHORT_NAMES.get(product['key'], product['key']),
                'cost': product['cost']['value'],
            } for product in interval['products']['buckets']]
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(years=res)

    @classmethod
    @with_cache()
    def get_cost_by_resource(cls, keys, date_from=None, date_to=None, search=None):
        """Return per-resource total cost for the given account key(s).

        :param keys: one linked account id or a list of them
        :param date_from: period start (defaults to the first of the current month)
        :param date_to: period end (defaults to the last instant of date_from's month)
        :param search: optional substring filter applied to resource ids
        :return: list of ``{'resource': ..., 'cost': ...}`` dicts, costliest first
        """
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        if search:
            s = s.query('wildcard', resource_id='*{}*'.format(search))
        # 0x7FFFFFFF (INT_MAX) asks ES for every bucket rather than the default top-10.
        agg = s.aggs.bucket('resources', 'terms', field='resource_id',
                            order={'cost': 'desc'}, size=0x7FFFFFFF)
        # NOTE(review): 'sum' is a metric aggregation; the other methods here
        # attach it with .metric() — confirm .bucket() is intentional.
        agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        resources = [{
            'resource': resource['key'],
            'cost': resource['cost']['value'],
        } for resource in res['aggregations']['resources']['buckets']]
        return resources

    @classmethod
    def get_monthly_cost_by_resource(cls, resource_ids, date_from=None, date_to=None):
        """Return ``{month_key: cost}`` for the given resource ids.

        Empty ``resource_ids`` short-circuits to ``{}`` without querying ES.
        """
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        if resource_ids:
            s = cls.search()
            s = s.filter('range',
                         usage_start_date={
                             'from': date_from.isoformat(),
                             'to': date_to.isoformat()
                         })
            s = s.filter('terms', resource_id=list(resource_ids))
            agg = s.aggs.bucket('months', 'date_histogram',
                                field='usage_start_date',
                                interval='month', min_doc_count=1)
            agg.metric('cost', 'sum', field='cost')
            r = client.search('awsdetailedlineitem',
                              body=s.to_dict(),
                              size=0,
                              request_timeout=60)
            return {
                e['key_as_string']: e['cost']['value']
                for e in r['aggregations']['months']['buckets']
            }
        else:
            return {}

    @classmethod
    @with_cache()
    def get_lambda_usage(cls, keys, date_from=None, date_to=None):
        """Return per-function Lambda usage (requests, GB-seconds, cost) for the period."""
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', product_name='AWS Lambda')
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        # Nested aggregation: resource -> usage_type -> item_description.
        agg = s.aggs.bucket('resources', 'terms', field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'avg', field='cost')
        agg = agg.bucket('types', 'terms', field='usage_type', size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        agg = agg.bucket('descriptions', 'terms', field='item_description',
                         size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        def _lambda_usage_regb(buckets, endswith):
            # Pick the summed quantity of the first usage-type bucket whose key
            # ends with the given suffix; returns None when no bucket matches.
            for b in buckets:
                if b['key'].endswith(endswith):
                    return b['quantity']['value']

        usages = [{
            'rid': usage['key'],
            # Lambda resource ids are ARNs; the last ':' segment is the function name.
            'name': usage['key'].split(':')[-1],
            'requests': _lambda_usage_regb(usage['types']['buckets'], '-Request'),
            'gb_seconds': _lambda_usage_regb(usage['types']['buckets'],
                                             '-Lambda-GB-Second'),
            'cost': usage['cost']['value'],
            'raw_cost': lambdapricing.get_raw_cost([
                x['descriptions']['buckets'] for x in usage['types']['buckets']
            ]),
        } for usage in res['aggregations']['resources']['buckets']]
        return usages

    @classmethod
    @with_cache()
    def get_s3_bandwidth_costs(cls, key, date_from=None, date_to=None):
        """Return S3 cost and quantity per usage type for one account key."""
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        agg = s.aggs.bucket('types', 'terms', field='usage_type',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg.metric('gb', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        transfers = [{
            'type': transfer['key'],
            'quantity': transfer['gb']['value'],
            'cost': transfer['cost']['value'],
        } for transfer in res['aggregations']['types']['buckets']]
        return transfers

    @classmethod
    @with_cache()
    def get_ec2_bandwidth_costs(cls, key, date_from=None, date_to=None):
        """Return EC2 cost and quantity per usage type for one account key."""
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Elastic Compute Cloud')
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        agg = s.aggs.bucket('types', 'terms', field='usage_type',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg.metric('gb', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        transfers = [{
            'type': transfer['key'],
            'quantity': transfer['gb']['value'],
            'cost': transfer['cost']['value'],
        } for transfer in res['aggregations']['types']['buckets']]
        return transfers

    @classmethod
    def get_ec2_daily_cost(cls, key):
        """Yield ``(day_string, cost)`` pairs of daily EC2 spend for one key."""
        s = cls.search()
        s = s.filter('term', key=key) if False else s  # (unreachable; kept format)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Elastic Compute Cloud')
        agg = s.aggs.bucket('intervals', 'date_histogram',
                            field='usage_start_date',
                            interval='day', min_doc_count=1)
        agg.metric('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        for interval in res['aggregations']['intervals']['buckets']:
            # 'key_as_string' is an ISO datetime; keep only the date part.
            yield interval['key_as_string'].split('T')[0], interval['cost']['value']

    @classmethod
    @with_cache()
    def get_elb_usage_a_day(cls, keys, date_from=None, date_to=None):
        """Return per-ELB daily averages of cost, hours and bytes for the period."""
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        # Fraction keeps the GiB multiplication exact until the final float().
        gib = Fraction(2**30)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s = s.filter("prefix", resource_id="arn:aws:elasticloadbalancing")
        s = s.sort({"usage_start_date": {"order": "desc"}})
        agg = s.aggs.bucket('rid', 'terms', field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg = agg.bucket('types', 'terms', field='usage_type', size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        elbs = [{
            'rid': elb['key'],
            'cost': elb['cost']['value'] / (date_to - date_from).days,
            'hours': float(
                sum([
                    x['quantity']['value'] for x in elb['types']['buckets']
                    if x['key'].endswith('LoadBalancerUsage')
                ]) / (date_to - date_from).days),
            'bytes': float((sum([
                x['quantity']['value'] for x in elb['types']['buckets']
                if x['key'].endswith('Bytes')
            ]) * gib) / (date_to - date_from).days),
        } for elb in res['aggregations']['rid']['buckets']]
        return elbs

    @classmethod
    @with_cache()
    def get_instance_type(cls, keys, date_from=None, date_to=None):
        """Return deduplicated (hour, instance type, region) records with their resource ids."""
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        # Restrict the returned _source to the fields we actually read below.
        s = s.extra(_source=[
            'usage_start_date', 'usage_type', 'availability_zone', 'resource_id'
        ])
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s = s.filter("term", product_name='Amazon Elastic Compute Cloud')
        # BoxUsage:<type> rows carry the instance type after the colon.
        s = s.query('wildcard', usage_type='*BoxUsage:*')
        s = s.filter('exists', field='resource_id')
        s = s.sort({"usage_start_date": {"order": "desc"}})
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=10000,
                            request_timeout=60)

        def cut_region_name(s):
            # Drop the trailing AZ letter (e.g. 'us-east-1a' -> 'us-east-1').
            return s[:-1] if s[-1].isalpha() else s

        types = []
        refs = {}

        def add_in_types(type, rid):
            # Merge rows sharing (hour, instance, region); 'refs' indexes
            # entries of 'types' so duplicates just accumulate rids.
            ref_tuple = (type['hour'], type['instance'], type['region'])
            if ref_tuple in refs:
                refs[ref_tuple]['rids'].append(rid)
                refs[ref_tuple]['ridCount'] += 1
                return
            type['rids'] = [rid]
            types.append(type)
            refs[ref_tuple] = types[-1]

        for r in res['hits']['hits']:
            elem = {
                'hour': r['_source']['usage_start_date'],
                'instance': r['_source']['usage_type'].split(':')[1],
                'region': cut_region_name(r['_source']['availability_zone'])
                if 'availability_zone' in r['_source'] else 'unknown',
                'ridCount': 1,
            }
            add_in_types(elem, r['_source']['resource_id'])
        return types

    @classmethod
    @with_cache()
    def get_instance_hour(cls, keys, date_from=None, date_to=None, min_hour=None):
        """Return instances with their average billed hours per active day.

        :param min_hour: if set, drop instances averaging fewer hours than this
        :return: list of ``{'id': ..., 'hours': ...}``, most-used first
        """
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s = s.filter("term", product_name='Amazon Elastic Compute Cloud')
        s = s.filter('prefix', resource_id='i-')
        s = s.query('wildcard', usage_type='*BoxUsage*')
        agg = s.aggs.bucket('resource_id', 'terms', field='resource_id',
                            size=0x7FFFFFFF)
        agg.bucket('days', 'date_histogram', field='usage_start_date',
                   interval='day', min_doc_count=1)
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        instance_list = []
        for instance in res['aggregations']['resource_id']['buckets']:
            tmp_hours = []
            for day in instance['days']['buckets']:
                # doc_count per day bucket == number of hourly line items.
                tmp_hours.append(day['doc_count'])
            avg_hours = sum(tmp_hours) / float(len(tmp_hours))
            if not min_hour or avg_hours >= min_hour:
                instance_list.append(dict(id=instance['key'], hours=avg_hours))
        return sorted(instance_list, key=lambda x: x['hours'], reverse=True)

    @classmethod
    @with_cache()
    def get_s3_buckets_per_tag(cls, keys):
        """Group S3 buckets by tag key and tag value for the given account keys."""

        def _check_if_in_list(dict_list, value, key):
            # First dict in 'dict_list' whose entry 'key' equals 'value', else None.
            return next((item for item in dict_list if item[key] == value), None)

        def _parse_tag_keys_results(res):
            bucket_tagged = []
            for bucket_tag_key in res['aggregations']['tag_key']['buckets']:
                buff_tag_key = _check_if_in_list(bucket_tagged,
                                                 bucket_tag_key['key'],
                                                 'tag_key')
                if buff_tag_key is None:
                    buff_tag_key = {
                        "tag_key": bucket_tag_key['key'],
                        "tag_value": []
                    }
                buff_tag_key = _parse_tag_values_results(
                    bucket_tag_key, buff_tag_key)
                bucket_tagged.append(buff_tag_key)
            return bucket_tagged

        def _parse_tag_values_results(bucket_tag_key, buff_tag_key):
            for bucket_tag_value in bucket_tag_key['tag_value']['buckets']:
                buff_tag_value = _check_if_in_list(buff_tag_key['tag_value'],
                                                   bucket_tag_value['key'],
                                                   'tag_value')
                if buff_tag_value is None:
                    buff_tag_value = {
                        "tag_value": bucket_tag_value['key'],
                        "s3_buckets": []
                    }
                buff_tag_value = _parse_buckets_results(
                    buff_tag_value, bucket_tag_value)
                buff_tag_key['tag_value'].append(buff_tag_value)
            return buff_tag_key

        def _parse_buckets_results(buff_tag_value, bucket_tag_value):
            # NOTE: 'ressource_id' spelling matches the aggregation name below.
            for bucket_resource_id in bucket_tag_value['ressource_id'][
                    'buckets']:
                buff_bucket_resource_id = _check_if_in_list(
                    buff_tag_value['s3_buckets'], bucket_resource_id['key'],
                    'bucket_name')
                if buff_bucket_resource_id is None:
                    buff_bucket_resource_id = {
                        "bucket_name": bucket_resource_id['key'],
                        "account_id":
                        bucket_resource_id['account_id']['buckets'][0]['key']
                    }
                buff_tag_value['s3_buckets'].append(buff_bucket_resource_id)
            return buff_tag_value

        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.query('exists', field="tag")
        s = s.query('wildcard', item_description="*storage*")
        agg = s.aggs.bucket('tag_key', 'terms', field="tag.key")
        agg = agg.bucket('tag_value', 'terms', field='tag.value')
        agg.bucket('ressource_id', 'terms',
                   field='resource_id').bucket('account_id', 'terms',
                                               field='linked_account_id')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        '''
        bucket_tagged structure
        [{
            "tag_key" : "KEY", # Unique in list
            "tag_value": [{
                "tag_value": "VALUE", # Unique in list
                "s3_buckets": [{
                    "bucket_name": "BUCKET_NAME",
                    "account_id": "ACCOUND_ID"
                }, {...}]
            }, {...}]
        }, {...}]
        '''
        bucket_tagged = _parse_tag_keys_results(res)
        return bucket_tagged

    @classmethod
    @with_cache()
    def get_s3_bandwidth_info_and_cost_per_name(cls, key, bucket_resource_ids,
                                                date_from=None, date_to=None):
        """Return per-bucket cost and per-usage-type transfer stats for S3 buckets."""
        # NOTE(review): relativedelta(month=1) is the ABSOLUTE form — it sets
        # the month to January rather than going back one month; 'months=1'
        # (plural, relative) was probably intended. Confirm before relying on
        # the default period.
        date_from = date_from or (datetime.utcnow() - relativedelta(
            month=1)).replace(day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.filter('terms',
                     resource_id=bucket_resource_ids if isinstance(
                         bucket_resource_ids, list) else [bucket_resource_ids])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s = s.filter('wildcard', usage_type="*Bytes")
        agg = s.aggs.bucket('bucket_name', 'terms', field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg = agg.bucket('transfer_type', 'terms', field='usage_type')
        agg.metric('data', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        data = [{
            "bucket_name": bucket['key'],
            "cost": bucket['cost']['value'],
            "transfer_stats": [{
                "type": transfer_stat['key'],
                "data": transfer_stat['data']['value']
            } for transfer_stat in bucket['transfer_type']['buckets']]
        } for bucket in res['aggregations']['bucket_name']['buckets']]
        return data
class SentimentAnalysis(PluginBase):
    """Class implementing a sentiment analysis plugin. Wraps the TextBlob library."""

    # Plugin output schema: the analysis result is stored as an ES object
    # under the 'text' key of the tweet document.
    data_schema = {"text": es.Object(Text)}

    def __init__(self, *args, **kwargs) -> None:
        """Initialise SentimentAnalysis instance, ensure we have downloaded required data."""
        # Ensure we have the various corpora that we need for analysis works.
        text_blob_download()
        nltk_download('vader_lexicon')
        # Initialise the Vader sentiment analysis tool
        self.analyser = SentimentIntensityAnalyzer()
        super().__init__(*args, **kwargs)  # type: ignore

    def process_tweet(self, tweet_json: Dict[str, Any]) -> Dict[str, Any]:
        """Attempt to analyse sentiment of the given tweet data.

        Returns the input dict unchanged when it is empty or has no '_raw'
        payload; otherwise attaches the analysis results under 'text'.
        """
        # pull either the tweet text, or the tweet full text depending on
        # whether the tweet was truncated
        if not tweet_json:
            return tweet_json
        raw_tweet: Dict[str, Any] = cast(Dict[str, Any], tweet_json.get('_raw'))
        if not raw_tweet:
            return tweet_json
        text_processing: Dict[str, Any] = {}
        text_processing['short_text'] = raw_tweet['text']
        text_processing['truncated'] = raw_tweet['truncated']
        if text_processing['truncated']:
            # Truncated tweets carry their full body in 'extended_tweet'.
            text_processing['full_text'] = raw_tweet['extended_tweet']['full_text']
        tweet_text = (
            text_processing['full_text']
            if text_processing['truncated'] else text_processing['short_text']
        )
        # NOTE(review): if _blob_process_tweet returns {} (falsy TextBlob),
        # the 'translated'/'pattern_*' keys below are missing and these
        # lookups raise KeyError — confirm TextBlob is never falsy for
        # non-empty text.
        text_processing.update(
            self._blob_process_tweet(
                tweet_text=tweet_text,
                tweet_lang=raw_tweet['lang'],
            )
        )
        # NLTK Vader sentiment analysis - translated field is created during
        # the text blob processing.
        text_processing.update(self._vader_classify(text_processing['translated']))
        LOG.info(
            'Polarity: %s, Subjectivity: %s, Word Count: %s, Language: %s',
            text_processing['pattern_polarity'],
            text_processing['pattern_subjectivity'],
            text_processing['tweet_length'],
            raw_tweet['lang']
        )
        tweet_json['text'] = text_processing
        return tweet_json

    @staticmethod
    def _blob_process_tweet(tweet_text: str, tweet_lang: str) -> Dict[str, Any]:
        """Analyse tweet text using the TextBlob class.

        Translates non-English text to English (best effort, with retries),
        then returns pattern-analyser polarity/subjectivity, word count and
        the (possibly translated) text under 'translated'.
        """
        text_processing: Dict[str, Union[str, int]] = {}
        blob = TextBlob(tweet_text)
        if not blob:
            return {}
        try:
            if tweet_lang not in ('en', 'und', None):
                LOG.debug('Attempting to translate from %s to English', tweet_lang)
                # We make use of the Tenacity retry library here to simplify
                # repeating a function call, however the default implementation
                # is a decorator, hence the pair of calls here, which return a
                # wrapped function which will deal with retries.
                retrying_translator = retry(
                    stop=stop_after_attempt(5),
                    wait=wait_fixed(0.5)
                )(
                    blob.translate
                )
                new_blob = retrying_translator(to='en')
                blob = new_blob
        except NotTranslated:
            # Source text is already English (or unchanged); keep original blob.
            pass
        except RetryError:
            # All 5 translation attempts failed; fall through with the
            # untranslated blob.
            LOG.error('Unable to translate tweet contents: %s', str(blob))
        text_processing['translated'] = str(blob)
        LOG.debug(
            'Analysing the following sentence for sentiment: %s',
            text_processing['translated']
        )
        # TextBlob based sentiment analysis, based on the Pattern Analyser,
        text_processing['pattern_polarity'] = blob.sentiment.polarity
        text_processing['pattern_subjectivity'] = blob.sentiment.subjectivity
        text_processing['tweet_length'] = len(blob.words)
        return text_processing

    def _vader_classify(self, tweet_text: str) -> Dict[str, Any]:
        """Analyse tweet text for sentiment using NLTK Vader Algorithm."""
        # NLTK Vader sentiment analysis.
        text_processing: Dict[str, float] = {}
        sentiment_scores = self.analyser.polarity_scores(tweet_text)
        if any(sentiment_scores):
            # Make sure we have something to dump into the output data.
            # {'neg': 0.347, 'neu': 0.653, 'pos': 0.0, 'compound': -0.1511}
            # NOTE(review): any() over a dict tests the KEYS, so this is
            # effectively a non-empty check, not a non-zero-score check.
            text_processing['vader_negative'] = sentiment_scores['neg']
            text_processing['vader_neutral'] = sentiment_scores['neu']
            text_processing['vader_positive'] = sentiment_scores['pos']
            text_processing['vader_compound'] = sentiment_scores['compound']
            # Inverted compound kept alongside the raw value for consumers
            # that want "higher = more negative".
            text_processing['vader_compound_inverted'] = sentiment_scores['compound'] * -1
        return text_processing
class AWSStat(dsl.DocType):
    """Elasticsearch document holding periodic AWS statistic snapshots.

    Each document stores one ``stat`` payload (in ``data``, mapping disabled)
    for one account ``key`` at one ``time``.  The public accessors below all
    follow the same pattern — "newest snapshot(s) of a given stat" — so the
    shared query and hit-guard logic is factored into two private helpers.
    """

    class Meta:
        index = 'awsstat'

    key = dsl.String(index='not_analyzed')
    time = dsl.Date(format='date_optional_time||epoch_millis')
    stat = dsl.String(index='not_analyzed')
    data = dsl.Object(enabled=False)

    @classmethod
    def _search_latest(cls, stat, key=None, keys=None, size=1):
        """Run the common "newest docs for a stat" query.

        :param stat: exact ``stat`` field value to match
        :param key: single account key (term filter), used when ``keys`` is None
        :param keys: list of account keys (terms filter)
        :param size: number of newest hits to return
        :return: raw Elasticsearch response dict
        """
        s = cls.search()
        if keys is not None:
            s = s.filter('terms', key=keys)
        else:
            s = s.filter('term', key=key)
        s = s.filter('term', stat=stat).sort('-time')
        return client.search(index='awsstat',
                             body=s.to_dict(),
                             size=size,
                             request_timeout=60)

    @classmethod
    def _latest_data(cls, stat, default, key=None, keys=None, require_data=False):
        """Return the ``data`` payload of the newest ``stat`` doc, or ``default``.

        :param require_data: also require a ``data`` field on the hit before
            dereferencing it (some older documents may lack it).
        """
        res = cls._search_latest(stat, key=key, keys=keys)
        if res['hits']['total'] > 0:
            source = res['hits']['hits'][0]['_source']
            if not require_data or 'data' in source:
                return source['data']
        return default

    @classmethod
    def latest_instance_stats(cls, key):
        """Return up to the ten most recent 'instances' snapshots for ``key``."""
        res = cls._search_latest('instances', key=key, size=10)
        stats = []
        for hit in res['hits']['hits']:
            stat = hit['_source']['data']
            stat.update(time=hit['_source']['time'])
            stats.append(stat)
        # ES already sorts newest-first, but re-sort defensively on the
        # timestamp we just attached.
        stats.sort(key=lambda entry: entry['time'], reverse=True)
        return dict(stats=stats)

    @classmethod
    def get_latest_instance_states(cls, key, instance_id, days=5):
        """Return the last ``days`` recorded states for one instance."""
        res = cls._search_latest('instancestate/' + instance_id,
                                 key=key, size=days)
        return [
            dict(time=hit['_source']['time'],
                 state=hit['_source']['data']['state'])
            for hit in res['hits']['hits']
        ]

    @classmethod
    def latest_on_demand_to_reserved_suggestion(cls, keys):
        """Latest on-demand-to-reserved suggestion, or ``{'total': 0}``."""
        return cls._latest_data('ondemandtoreserved', dict(total=0),
                                keys=any_key_to_string_array(keys))

    @classmethod
    def latest_s3_space_usage(cls, keys):
        """Latest S3 space usage snapshot, or ``None`` when absent."""
        return cls._latest_data('s3spaceusage', None,
                                keys=any_key_to_string_array(keys))

    @classmethod
    def latest_available_volumes(cls, keys):
        """Latest detached-volumes snapshot, or ``{'total': 0}``."""
        return cls._latest_data('detachedvolumes', dict(total=0),
                                keys=any_key_to_string_array(keys))

    @classmethod
    def latest_hourly_cpu_usage_by_tag(cls, key):
        """Latest hourly-CPU-by-tag snapshot, or ``{'tags': []}``."""
        return cls._latest_data('hourlycpubytag', dict(tags=[]),
                                key=key, require_data=True)

    @classmethod
    def latest_daily_cpu_usage_by_tag(cls, key):
        """Latest daily-CPU-by-tag snapshot, or ``{'tags': []}``."""
        return cls._latest_data('dailycpubytag', dict(tags=[]),
                                key=key, require_data=True)

    @classmethod
    def latest_stopped_instances_report(cls, keys):
        """Latest stopped-instances report, or ``{'total': 0}``."""
        return cls._latest_data('stoppedinstancesreport', dict(total=0),
                                keys=any_key_to_string_array(keys))
class PublicationDoc(DocType):
    """Search index document for a publication, mirroring the database model.

    ``all_data`` aggregates the searchable text fields via ``copy_to`` so a
    single-field full-text query covers title and contact email.
    """

    all_data = edsl.Text()
    id = edsl.Integer()
    title = edsl.Text(copy_to=ALL_DATA_FIELD)
    date_published = edsl.Date()
    last_modified = edsl.Date()
    code_archive_url = edsl.Keyword()
    doi = edsl.Keyword()
    contact_email = edsl.Keyword(copy_to=ALL_DATA_FIELD)
    container = edsl.Object(ContainerInnerDoc)
    tags = edsl.Nested(RelatedInnerDoc)
    sponsors = edsl.Nested(RelatedInnerDoc)
    platforms = edsl.Nested(RelatedInnerDoc)
    model_documentation = edsl.Keyword()
    authors = edsl.Nested(AuthorInnerDoc)

    @classmethod
    def from_instance(cls, publication):
        """Build an index-ready dict (with metadata) from a publication model."""

        def related(manager):
            # Many-to-many relations all index as (id, name) inner docs.
            return [RelatedInnerDoc(id=obj.id, name=obj.name)
                    for obj in manager.all()]

        container = publication.container
        container_doc = ContainerInnerDoc(id=container.id,
                                          name=container.name,
                                          issn=container.issn)
        author_docs = [
            AuthorInnerDoc(id=a.id,
                           name=a.name,
                           orcid=a.orcid,
                           researcherid=a.researcherid,
                           email=a.email)
            for a in publication.creators.all()
        ]
        doc = cls(
            meta={'id': publication.id},
            id=publication.id,
            title=publication.title,
            date_published=publication.date_published,
            last_modified=publication.date_modified,
            code_archive_url=publication.code_archive_url,
            contact_email=publication.contact_email,
            container=container_doc,
            doi=publication.doi,
            tags=related(publication.tags),
            sponsors=related(publication.sponsors),
            platforms=related(publication.platforms),
            model_documentation=[
                md.name for md in publication.model_documentation.all()
            ],
            authors=author_docs,
        )
        return doc.to_dict(include_meta=True)

    def get_public_detail_url(self):
        """URL of this publication's public detail page."""
        return reverse('core:public-publication-detail',
                       kwargs={'pk': self.meta.id})

    @classmethod
    def get_breadcrumb_data(cls):
        """Breadcrumb trail shown on publication pages."""
        trail = [
            {'link': reverse('core:public-home'), 'text': 'Home'},
            {'text': 'Publications'},
        ]
        return {'breadcrumb_trail': trail}

    @classmethod
    def get_public_list_url(cls, search=None):
        """Public search-page URL, optionally carrying a search query string."""
        location = reverse('core:public-search')
        if not search:
            return location
        query_string = urlencode({'search': search})
        location += '?{}'.format(query_string)
        return location

    class Index:
        name = 'publication'
        settings = {'number_of_shards': 1}