Example #1
class PlayersInGame(elasticsearch_dsl.InnerDoc):
    """Players in game search model"""

    north = elasticsearch_dsl.Object(Player)
    east = elasticsearch_dsl.Object(Player)
    south = elasticsearch_dsl.Object(Player)
    west = elasticsearch_dsl.Object(Player)
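A minimal self-contained sketch of how such an InnerDoc serializes; here Player is a hypothetical stand-in for the real model, which is not shown in this example:

# Abridged sketch, assuming Player has a single name field.
import elasticsearch_dsl

class Player(elasticsearch_dsl.InnerDoc):
    name = elasticsearch_dsl.Text()

class PlayersInGame(elasticsearch_dsl.InnerDoc):
    north = elasticsearch_dsl.Object(Player)
    south = elasticsearch_dsl.Object(Player)

players = PlayersInGame(north=Player(name="Alice"), south=Player(name="Bob"))
print(players.to_dict())
# {'north': {'name': 'Alice'}, 'south': {'name': 'Bob'}}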
Example #2
class ClusterSource(es.Document):
    name = es.Keyword()
    clusters = es.Object()
    clustering_params = es.Object()

    class Index:
        name = ES_INDEX_SOURCE_CLUSTERS
        using = ES_CLIENT

        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 1,
        }

        mappings = {
            "properties": {
                "name": {
                    "type": "keyword",
                },
                "clusters": {
                    "type": "object",
                },
                "clustering_params": {
                    "type": "object",
                },
            },
        }
Example #3
def gendocu(stream, estype='document'):
    """
    Creates the mapping for type document in Elasticsearch
    :param estype: Name of ES type (defaults to 'document')
    """
    m = dsl.Mapping(estype)
    # Set properties
    m.properties.dynamic = 'strict'
    # Adding mapping
    m = gencontext(m)
    m = m.field('@id', 'string', index='not_analyzed')
    m = m.field('@type', 'string', index='no')
    m = m.field('dc:contributor',
                'string',
                index='analyzed',
                analyzer='autocomplete')
    access = dsl.Object()
    access = access.property('@type', 'string')
    access = access.property('@value', 'date', format='dateOptionalTime')
    m = m.field('dct:issued', access)
    m = m.field('dct:modified', access)
    m = m.field('foaf:primaryTopic',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    # Emit the mapping (pretty-printed to the stream rather than sent to ES)
    pprint(m.to_dict(), stream=stream)
Example #4
class TopicCombo(es.Document):
    topics = es.Object()
    common_docs_ids = es.Keyword()
    common_docs_num = es.Integer()

    class Index:
        name = ES_INDEX_TOPIC_COMBOS  # f"{ES_INDEX_TOPIC_COMBOS}_{tm}"
        using = ES_CLIENT

        settings = {
            "number_of_shards": 2,
            "number_of_replicas": 1,
            "max_result_window": 5000000,
        }
        mappings = {
            "properties": {
                "topics": {
                    "type": "object"
                },
                "common_docs_ids": {
                    "type": "keyword",
                },
                "common_docs_num": {
                    "type": "integer",
                },
            }
        }
Example #5
def test_aggregate_data_schema():
    """Verify the behaviour of the ingress.utils.aggregate_data_schema function."""
    class Base:  # noqa
        data_schema = {}

    class Sub1(Base):  # noqa
        data_schema = {'Sub1': True}

    class Sub2(Base):  # noqa
        data_schema = {'Sub2': True}

    class SubSub1(Sub2):  # noqa
        data_schema = {'SubSub1': True}

    class SubSub2(Sub2):  # noqa
        data_schema = {'SubSub2': True}

    aggregated_schema = iu.aggregate_data_schema(Base, include_defaults=True)

    assert aggregated_schema == {
        'Sub1': True,
        'Sub2': True,
        'SubSub1': True,
        'SubSub2': True,
        '_raw': es.Object(dynamic=True),
        'timestamp': es.Date()
    }
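The assertion compares elasticsearch-dsl field objects directly; this works because fields implement equality by comparing their serialized form, e.g.:

import elasticsearch_dsl as es

assert es.Object(dynamic=True) == es.Object(dynamic=True)  # equal to_dict()
assert es.Date() != es.Keyword()                           # different types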
Example #6
class Activiteit(es.DocType):
    ext_id = es.String(index='not_analyzed')
    naam = es.String(analyzer=dutch_analyzer)
    beschrijving = es.String(analyzer=dutch_analyzer)
    bron_link = es.String(index='not_analyzed')
    tijdstip = es.String(index='not_analyzed')
    tags = es.String(index='not_analyzed')
    centroid = es.GeoPoint()
    locatie = es.Object(doc_class=Locatie,
                        properties={
                            'ext_id':
                            es.String(index='not_analyzed'),
                            'naam':
                            es.String(analyzer=dutch_analyzer),
                            'centroid':
                            es.GeoPoint(),
                            'openbare_ruimte_naam':
                            es.String(index='not_analyzed'),
                            'huisnummer':
                            es.String(index='not_analyzed'),
                            'huisnummer_toevoeging':
                            es.String(index='not_analyzed'),
                            'postcode':
                            es.String(index='not_analyzed')
                        })
Example #7
class Node(es.DocType):
    """
    Elastic document describing user
    """

    node_type = es.Keyword()

    objectID = es.Keyword()

    name = es.Text(
        fielddata=True,
        analyzer=autocomplete
    )

    user = es.Object(
        properties={
            'id': es.Keyword(),
            'name': es.Text(
                fielddata=True,
                analyzer=autocomplete)
        }
    )

    description = es.Text()

    is_free = es.Boolean()

    project = es.Object(
        properties={
            'id': es.Keyword(),
            'name': es.Keyword(),
            'url': es.Keyword(),
        }
    )

    media = es.Keyword()

    picture = es.Keyword()

    tags = es.Keyword(multi=True)
    license_notes = es.Text()

    created_at = es.Date()
    updated_at = es.Date()

    class Meta:
        index = 'nodes'
Example #8
class TestSearchDocument(BaseDocument):
    # pylint: disable=no-member
    name = dsl.String()
    num = dsl.Integer()
    json = dsl.Object()

    class Meta:
        index = 'test_search'
Example #9
def gencontext(mapobj):
    context = dsl.Object()
    namespaces = [
        'bibo', 'dbp', 'dc', 'dct', 'foaf', 'rdau', 'rdf', 'rdfs', 'skos',
        'xsd'
    ]
    for token in namespaces:
        context = context.property(token, 'string', index='no')
    return mapobj.field('@context', context)
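A rough sketch of what this helper yields under the legacy (pre-2.x) elasticsearch-dsl API these mapping examples use; the doc-type name and the abridged output shape are assumptions:

import elasticsearch_dsl as dsl
from pprint import pprint

m = dsl.Mapping('document')      # legacy Mapping(doc_type) constructor
pprint(gencontext(m).to_dict())
# Expected shape (abridged):
# {'document': {'properties': {'@context': {'properties': {
#     'bibo': {'index': 'no', 'type': 'string'},
#     ...}}}}}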
Example #10
def deep_field_factory(field):
    if field.is_relation and (field.many_to_one or field.one_to_one):
        props = {}
        for f in field.related_model._meta.get_fields():
            nested_field = deep_field_factory(f)
            if nested_field is not None:
                props[f.name] = nested_field
        return dsl.Object(properties=props)
    else:
        return document_field(field)
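To make the returned field concrete, a small illustration (independent of any Django model) of the mapping fragment that dsl.Object(properties=...) renders to:

import elasticsearch_dsl as dsl

obj = dsl.Object(properties={'id': dsl.Integer(), 'name': dsl.Text()})
print(obj.to_dict())
# {'properties': {'id': {'type': 'integer'}, 'name': {'type': 'text'}},
#  'type': 'object'}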
Example #11
class GameSummary(elasticsearch_dsl.Document):
    """Game search model"""

    id = elasticsearch_dsl.Text()
    name = elasticsearch_dsl.Text()
    isPublic = elasticsearch_dsl.Boolean()
    players = elasticsearch_dsl.Object(PlayersInGame)

    class Index:  # pylint: disable=missing-class-docstring
        name = "games"
Example #12
class TestSearchDocument(BaseDocument):
    id = dsl.Integer()
    name = dsl.Text(fielddata=True)
    num = dsl.Integer()
    date = dsl.Date()
    json = dsl.Object()

    field_name = Name()
    field_process_type = ProcessType()
    none_test = dsl.Integer()

    class Index:
        name = "test_search"
Example #13
class TestSearchDocument(BaseDocument):
    # pylint: disable=no-member
    id = dsl.Integer()  # pylint: disable=invalid-name
    name = dsl.String()
    num = dsl.Integer()
    json = dsl.Object()

    field_name = Name()
    field_process_type = ProcessType()
    none_test = dsl.Integer()

    class Meta:
        index = 'test_search'
Example #14
class GeoCoding(PluginBase):
    """Class that will attempt to geotag a tweet."""

    data_schema = {
        'geotagged': es.Boolean(),
        'location': es.Object(Location),
        'coordinates': es.GeoPoint(),
    }

    def __init__(self, *args, **kwargs) -> None:
        """Setup Carmen geotagging options, then init super."""
        with warnings.catch_warnings():
            # The default setup of carmen appears to raise several warnings;
            # we suppress them with the catch_warnings context manager.
            warnings.simplefilter("ignore")
            resolver_options = {'place': {'allow_unknown_locations': True}}
            self.geotagger = get_resolver(options=resolver_options)
            self.geotagger.load_locations()
            self.location_resolver = LocationEncoder()

        super().__init__(*args, **kwargs)  # type: ignore

    def process_tweet(self, tweet_json: Dict[str, Any]) -> Dict[str, Any]:
        """
        Attempt to geotag the tweet data.

        Returns the tweet with new data if any was resolved and will set
        geotagged according to success or failure.
        """
        LOG.debug('Attempting to geotag tweet')
        tweet_location = self.geotagger.resolve_tweet(tweet_json['_raw'])

        tweet_json['geotagged'] = False

        if tweet_location:
            LOG.debug('  This tweet includes location information')
            tweet_json['location'] = self.location_resolver.default(
                tweet_location[1])

            if 'latitude' in tweet_json[
                    'location'] and 'longitude' in tweet_json['location']:
                tweet_json['coordinates'] = {
                    'lat': tweet_json['location']['latitude'],
                    'lon': tweet_json['location']['longitude'],
                }

                tweet_json['geotagged'] = True
                LOG.debug('Geotagging completed!')

        return tweet_json
Example #15
class EmbeddingIndex(es.Document):
    corpus = es.Keyword()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()

    by_unit = es.Keyword()  # Token/Word/Sentence/Text
    algorithm = es.Keyword()
    pooling = es.Keyword()
    meta_parameters = es.Object()

    class Index:
        name = ES_INDEX_EMBEDDING
        using = ES_CLIENT
Example #16
def aggregate_data_schema(
    base_class: Type,
    include_defaults: bool = True,
) -> Dict[str, Any]:
    """Iterate through imported plugins and create an ingress mapping to process the data with."""
    mapping: Dict = {}
    for subclass in find_subclasses(base_class):
        subclass_data_schema = None
        try:
            subclass_data_schema = getattr(subclass, 'data_schema')
        except AttributeError:
            continue
        if subclass_data_schema:
            mapping.update(subclass_data_schema)

    if include_defaults:
        mapping['_raw'] = es.Object(dynamic=True)
        mapping['timestamp'] = es.Date()

    return mapping
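One way the aggregated schema might be consumed (a sketch, not the project's confirmed wiring; PluginBase and the 'tweets' index name are assumptions):

import elasticsearch_dsl as es

schema = aggregate_data_schema(PluginBase, include_defaults=True)

mapping = es.Mapping()
for field_name, field in schema.items():
    mapping.field(field_name, field)
# mapping.save('tweets')  # would push the mapping to a hypothetical index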
Example #17
class TopicModellingIndex(es.Document):
    corpus = es.Keyword()
    source = es.Keyword()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()
    has_topic_info = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()

    datetime_from = es.Date()
    datetime_to = es.Date()

    algorithm = es.Keyword()
    number_of_topics = es.Integer()
    hierarchical = es.Boolean()
    meta_parameters = es.Object()

    perplexity = es.Float()
    purity = es.Float()
    contrast = es.Float()
    coherence = es.Float()

    tau_smooth_sparse_theta = es.Float()
    tau_smooth_sparse_phi = es.Float()
    tau_decorrelator_phi = es.Float()
    tau_coherence_phi = es.Float()

    topics = es.Nested(Topic)

    is_actualizable = es.Boolean()

    class Index:
        name = ES_INDEX_TOPIC_MODELLING
        using = ES_CLIENT
Example #18
class Customer(ArchivingDocType):
    """Model a customer."""

    name = dsl.Keyword()
    permissions = dsl.Object()
    cycles = dsl.Object()

    class Meta:
        index = auth_index._name

    @classmethod
    def get_by_name(cls, name):
        # type: (str) -> reles.auth.models.Customer
        """Get the first customer with the given name."""
        response = cls.search(index=auth_index._name).filter(
            'term',
            name=name
        ).execute()

        if response.hits.total == 0:
            raise NotFoundError(
                'There is no customer with name \'{}\''.format(name)
            )
        elif response.hits.total == 1:
            return response[0]
        else:
            raise ConflictError(
                'Inconsistent data detected: there are {} customers with name'
                ' \'{}\': {}'.format(
                    response.hits.total,
                    name,
                    [user.meta.id for user in response.hits],
                )
            )

    @classmethod
    def charge_cycles(cls, customer_id, target, cycles):
        es = dsl.connections.connections.get_connection(cls._doc_type.using)

        try:
            return es.update(
                index=cls._doc_type.index,
                doc_type=cls._doc_type.name,
                id=customer_id,
                body={
                    'script': {
                        'file': 'charge_cycles',
                        'lang': 'groovy',
                        'params': {
                            'index': target,
                            'cycles': cycles,
                        }
                    }
                }
            )
        except elasticsearch.NotFoundError:
            app.logger.debug(
                'Failed to charge non-existent customer `%s` with %d cycles for'
                ' index `%s`', customer_id, cycles, target
            )
            raise NotFoundError('Invalid customer')

    def add_permissions(self, permissions):
        """Add the given permissions."""
        for index, added in permissions.items():
            self.permissions[index] = list(
                set(self.permissions.to_dict().get(index, [])).union(added)
            )

    def remove_permissions(self, permissions):
        """Remove the given permissions."""
        for index, removed in permissions.items():
            self.permissions[index] = list(
                set(self.permissions.to_dict().get(index, [])).difference(removed)
            )

    def save(self, using=None, index=None, validate=True, **kwargs):
        """Save a customer instance."""
        self._update_aliases()
        super(Customer, self).save(using, index, validate, **kwargs)

    def _update_aliases(self):
        alias_actions = []

        # delete aliases of removed permissions
        #  this encodes knowledge about how aliases are formatted
        affected_indexes = app.es.indices.get_alias('*_%s_*' % self.name)
        for index in affected_indexes:
            permissions_by_index = self.permissions.to_dict().get(index, [])

            for alias in affected_indexes[index]['aliases']:
                _, customer, permission = unalias(alias)
                if customer == self.name and permission not in permissions_by_index:
                    alias_actions.append(
                        self._build_alias_action('remove', index, permission)
                    )

        # create aliases for added permissions
        for index in self.permissions:
            for permission in self.permissions[index]:
                if not app.es.indices.exists_alias(name=get_alias(index, self.name, permission)):
                    alias_actions.append(
                        self._build_alias_action('add', index, permission)
                    )

        if alias_actions:
            app.es.indices.update_aliases(body={'actions': alias_actions})

    def _build_alias_action(self, action, index, permission):
        return {
            action: {
                'index': index,
                'alias': get_alias(index, self.name, permission)
            }
        }

    def refresh(self):
        """Sync the instance with the ES."""
        self.__dict__.update(self.get(self.meta.id).__dict__)
Example #19
def genbibres(stream, estype='bibliographicResource'):
    """
    Creates the mapping for type bibliographicResource in Elasticsearch
    :param estype: Name of ES type (defaults to 'bibliographicResource')
    """
    m = dsl.Mapping(estype)
    # Set properties
    m.properties.dynamic = 'strict'
    # Adding mapping
    m = gencontext(m)
    m = m.field('@id', 'string', index='not_analyzed')
    m = m.field('@type', 'string', index='no')
    m = m.field('bibo:edition', 'string', index='analyzed')
    m = m.field('bibo:isbn10', 'string', index='not_analyzed')
    m = m.field('bibo:isbn13', 'string', index='not_analyzed')
    m = m.field('bibo:issn', 'string', index='not_analyzed')
    m = m.field('dbp:originalLanguage',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    contrib = dsl.Nested()
    contrib = contrib.property('@id', dsl.String(index='no'))
    contrib = contrib.property('@type', dsl.String(index='no'))
    contrib = contrib.property('dbp:birthYear',
                               dsl.String(index='not_analyzed'))
    contrib = contrib.property('dbp:deathYear',
                               dsl.String(index='not_analyzed'))
    contrib = contrib.property('foaf:firstName', dsl.String(index='analyzed'))
    contrib = contrib.property('foaf:lastName', dsl.String(index='analyzed'))
    contrib = contrib.property('foaf:name', dsl.String(index='analyzed'))
    contrib = contrib.property('rdfs:label', dsl.String(index='analyzed'))
    contrib = contrib.property('skos:note', dsl.String(index='analyzed'))
    m = m.field('dc:contributor', contrib)
    m = m.field('dc:format', 'string', index='analyzed')
    m = m.field('dct:alternative',
                'string',
                index='analyzed',
                fields={'folded': dsl.String(analyzer='text_folded')})
    m = m.field('dct:bibliographicCitation',
                'string',
                index='analyzed',
                analyzer='standard')
    m = m.field('dct:hasPart', 'string', index='analyzed')
    m = m.field('dct:isPartOf',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('dct:issued', 'string', index='analyzed')
    m = m.field('dct:language',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('dct:subject',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('dct:title',
                'string',
                index='analyzed',
                fields={'folded': dsl.String(analyzer='text_folded')})
    m = m.field('rdau:contentType',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('rdau:dissertationOrThesisInformation',
                'string',
                index='analyzed')
    m = m.field('rdau:mediaType',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('rdau:modeOfIssuance',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('rdau:noteOnResource', 'string', index='not_analyzed')
    m = m.field('rdau:placeOfPublication',
                dsl.Object().property('@id', 'string', index='not_analyzed'))
    m = m.field('rdau:publicationStatement', 'string', index='analyzed')
    m = m.field(
        'rdfs:isDefinedBy',
        dsl.Object().property('@id',
                              'string',
                              index='analyzed',
                              analyzer='extr_id'))
    # Emit the mapping (pretty-printed to the stream rather than sent to ES)
    pprint(m.to_dict(), stream=stream)
Example #20
    if options.dry_run:
        for hit in document_generator:
            logging.info(hit)
        success = True
    else:
        success, _ = bulk(es, gen, max_retries=20, initial_backoff=2, max_backoff=3600)
    return success


machine_ad = edsl.Mapping.from_es(
    doc_type="machine_ad", index=options.indexname, using=es
)
if not "claims" in machine_ad or not "failed" in machine_ad.to_dict()['machine_ad']['properties']['claims']['properties']:
    machine_ad.field(
        "jobs",
        edsl.Object(properties={status: edsl.Text(multi=True) for status in STATUSES}),
    )
    machine_ad.field(
        "claims",
        edsl.Object(
            properties={
                status: edsl.Object(
                    properties={resource: edsl.Float() for resource in RESOURCES}
                )
                for status in STATUSES
            }
        ),
    )
    machine_ad.field(
        "occupancy",
        edsl.Object(
Example #21
class AWSDetailedLineitem(dsl.DocType):
    class Meta:
        index = 'awsdetailedlineitem'

    availability_zone = dsl.String(index='not_analyzed')
    cost = dsl.Double()
    un_blended_cost = dsl.Double()
    item_description = dsl.String(index='not_analyzed')
    linked_account_id = dsl.String(index='not_analyzed')
    operation = dsl.String()
    payer_account_id = dsl.String(index='not_analyzed')
    pricing_plan_id = dsl.Long()
    product_name = dsl.String(index='not_analyzed')
    rate = dsl.Double()
    un_blended_rate = dsl.Double()
    rate_id = dsl.Long()
    record_id = dsl.String(index='not_analyzed')
    reserved_instance = dsl.Boolean()
    resource_id = dsl.String(index='not_analyzed')
    subscription_id = dsl.Long()
    tag = dsl.Object(
        properties={
            'key': dsl.String(index='not_analyzed'),
            'value': dsl.String(index='not_analyzed')
        })
    usage_end_date = dsl.Date(format='strict_date_optional_time||epoch_millis')
    usage_quantity = dsl.Double()
    usage_start_date = dsl.Date(
        format='strict_date_optional_time||epoch_millis')
    usage_type = dsl.String(index='not_analyzed')

    @classmethod
    @with_cache(ttl=3600 * 3, worker_refresh=True)
    def keys_has_data(cls, keys, date_from=None, date_to=None):
        date_to = date_to or datetime.utcnow()
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        if date_from:
            s = s.filter('range',
                         usage_start_date={
                             'from': date_from.isoformat(),
                             'to': date_to.isoformat()
                         })
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        return res['hits']['total'] > 0

    @classmethod
    @with_cache(is_json=False, ret=lambda x: datetime.strptime(x, "%Y-%m-%d"))
    def get_first_date(cls, keys):
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.sort('usage_start_date')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=1,
                            request_timeout=60)
        if res['hits']['total'] == 0:
            return
        return res['hits']['hits'][0]['_source']['usage_start_date'].split(
            'T')[0]

    @classmethod
    @with_cache(is_json=False, ret=lambda x: datetime.strptime(x, "%Y-%m-%d"))
    def get_last_date(cls, keys, limit=None):
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        if limit:
            s = s.filter('range', usage_start_date={'to': limit.isoformat()})
        s = s.sort('-usage_start_date')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=1,
                            request_timeout=60)
        if res['hits']['total'] == 0:
            return
        return res['hits']['hits'][0]['_source']['usage_start_date'].split(
            'T')[0]

    @classmethod
    def get_first_to_now_date(cls, keys):
        def from_date_to_today(d):
            now = datetime.utcnow()
            while d < now:
                yield d
                d += relativedelta(months=1)

        return list(from_date_to_today(cls.get_first_date(keys)))

    @classmethod
    def get_first_to_last_date(cls, keys):
        def from_date_to_last(d):
            last = cls.get_last_date(keys)
            while d < last:
                yield d
                d += relativedelta(months=1)

        return list(from_date_to_last(cls.get_first_date(keys)))

    @classmethod
    @with_cache(6 * 3600)
    def get_available_tags(cls, keys, only_with_data=None, product_name=None):
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        if product_name:
            s = s.filter('term', product_name=product_name)
        s.aggs.bucket('tag_key', 'terms', field='tag.key')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        tags = []
        for tag in res['aggregations']['tag_key']['buckets']:
            if tag['key'].startswith('user:'):
                name = tag['key'].split(':')[1]
                if not only_with_data or name in AWSStat.latest_hourly_cpu_usage_by_tag(
                        only_with_data
                )['tags'] or name in AWSStat.latest_daily_cpu_usage_by_tag(
                        only_with_data)['tags']:
                    tags.append(name)
        tags.sort()
        return dict(tags=tags)

    @classmethod
    @with_cache(ttl=6 * 3600)
    def get_cost_by_tag(cls, keys, tag, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', **{'tag.key': 'user:{}'.format(tag)})
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s.aggs.bucket('total_cost', 'sum', field='cost')
        agg = s.aggs.bucket('tag_value',
                            'terms',
                            field='tag.value',
                            size=0x7FFFFFFF)
        agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        tags = [{
            'tag_value': tag['key'],
            'cost': tag['cost']['value'],
        } for tag in res['aggregations']['tag_value']['buckets']]
        return dict(tags=tags,
                    total_cost=res['aggregations']['total_cost']['value'])

    @classmethod
    @with_cache(ttl=6 * 3600)
    def get_cost(cls, keys, date_from, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s.aggs.bucket('total_cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        return dict(total_cost=res['aggregations']['total_cost']['value'])

    @classmethod
    @with_cache()
    def get_monthly_cost_by_tag(cls, keys, tag, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', **{'tag.key': 'user:{}'.format(tag)})
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='month',
                            min_doc_count=1)
        agg.bucket('total_cost', 'sum', field='cost')
        agg = agg.bucket('tag_value',
                         'terms',
                         field='tag.value',
                         size=0x7FFFFFFF)
        agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        months = [{
            'month':
            interval['key_as_string'].split('T')[0][:-3],
            'tags': [{
                'tag_value': tag['key'],
                'cost': tag['cost']['value'],
            } for tag in interval['tag_value']['buckets']],
            'total_cost':
            interval['total_cost']['value'],
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(months=months)

    @classmethod
    @with_cache()
    def get_cost_by_product(cls,
                            key,
                            date_from=None,
                            date_to=None,
                            without_discount=False,
                            only_discount=False,
                            size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        if without_discount:
            s = s.query(
                'bool',
                filter=[
                    ~dsl.Q('term', item_description='PAR_APN_ProgramFee_2500')
                ])
        if only_discount:
            s = s.filter('term', item_description='PAR_APN_ProgramFee_2500')
        agg = s.aggs.bucket('products',
                            'terms',
                            field='product_name',
                            order={'cost': 'desc'},
                            size=size)
        agg.bucket('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        products = [{
            'product': SHORT_NAMES.get(product['key'], product['key']),
            'cost': product['cost']['value'],
        } for product in res['aggregations']['products']['buckets']]
        return dict(products=products)

    @classmethod
    @with_cache()
    def get_cost_by_region(cls,
                           keys,
                           tagged=False,
                           byaccount=False,
                           date_from=None,
                           date_to=None,
                           size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })

        agg = s.aggs
        if byaccount:
            agg = agg.bucket('accounts', 'terms', field='linked_account_id')
        agg = agg.bucket('intervals',
                         'date_histogram',
                         field='usage_start_date',
                         interval='month',
                         min_doc_count=1)
        agg = agg.bucket('regions',
                         'terms',
                         field='availability_zone',
                         size=size)
        agg.bucket('cost', 'sum', field='cost')
        if tagged:
            agg = agg.bucket('tags', 'terms', field='tag.value')
            agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0)

        return res['aggregations']

    @classmethod
    @with_cache()
    def get_monthly_cost(cls,
                         keys,
                         date_from=None,
                         date_to=None,
                         size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='month',
                            min_doc_count=1)
        agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        res = [{
            'month': interval['key_as_string'].split('T')[0],
            'total_cost': interval['cost']['value'],
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(months=res)

    @classmethod
    @with_cache()
    def get_monthly_cost_by_product(cls,
                                    keys,
                                    tagged=False,
                                    date_from=None,
                                    date_to=None,
                                    size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='month',
                            min_doc_count=1)
        agg = agg.bucket('products', 'terms', field='product_name', size=size)
        agg.bucket('cost', 'sum', field='cost')
        if tagged:
            agg = agg.bucket('tags', 'terms', field='tag.value')
            agg.bucket('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        def tagged_cost(bucket, total):
            total_tag = 0.0
            for tag in bucket:
                total_tag += tag['cost']['value']
                yield (tag['key'], tag['cost']['value'])
            if total != total_tag:
                yield ('untagged', total - total_tag)

        res = [{
            'month':
            interval['key_as_string'].split('T')[0],
            'products': [{
                'product':
                SHORT_NAMES.get(product['key'], product['key']),
                'cost':
                product['cost']['value'],
                'tags': [{
                    'name': tag[0],
                    'cost': tag[1],
                } for tag in tagged_cost(product['tags']['buckets'],
                                         product['cost']['value'])],
            } for product in interval['products']['buckets']] if tagged else [{
                'product':
                SHORT_NAMES.get(product['key'], product['key']),
                'cost':
                product['cost']['value'],
            } for product in interval['products']['buckets']]
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(months=res)

    @classmethod
    @with_cache(ttl=4 * 3600)
    def get_daily_cost_by_product(cls,
                                  keys,
                                  date_from=None,
                                  date_to=None,
                                  size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='day',
                            min_doc_count=1)
        agg = agg.bucket('products', 'terms', field='product_name', size=size)
        agg.metric('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        res = [{
            'day':
            interval['key_as_string'].split('T')[0],
            'products': [{
                'product':
                SHORT_NAMES.get(product['key'], product['key']),
                'cost':
                product['cost']['value'],
            } for product in interval['products']['buckets']]
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(days=res)

    @classmethod
    @with_cache(ttl=24 * 3600)
    def get_yearly_cost_by_product(cls,
                                   keys,
                                   date_from=None,
                                   date_to=None,
                                   size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            month=1, day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(month=12,
                                               day=31,
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='year',
                            min_doc_count=1)
        agg = agg.bucket('products', 'terms', field='product_name', size=size)
        agg.metric('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        res = [{
            'year':
            interval['key_as_string'][:4],
            'products': [{
                'product':
                SHORT_NAMES.get(product['key'], product['key']),
                'cost':
                product['cost']['value'],
            } for product in interval['products']['buckets']]
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(years=res)

    @classmethod
    @with_cache()
    def get_cost_by_resource(cls,
                             keys,
                             date_from=None,
                             date_to=None,
                             search=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        if search:
            s = s.query('wildcard', resource_id='*{}*'.format(search))
        agg = s.aggs.bucket('resources',
                            'terms',
                            field='resource_id',
                            order={'cost': 'desc'},
                            size=0x7FFFFFFF)
        agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        resources = [{
            'resource': resource['key'],
            'cost': resource['cost']['value'],
        } for resource in res['aggregations']['resources']['buckets']]
        return resources

    @classmethod
    def get_monthly_cost_by_resource(cls,
                                     resource_ids,
                                     date_from=None,
                                     date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        if resource_ids:
            s = cls.search()
            s = s.filter('range',
                         usage_start_date={
                             'from': date_from.isoformat(),
                             'to': date_to.isoformat()
                         })
            s = s.filter('terms', resource_id=list(resource_ids))
            agg = s.aggs.bucket('months',
                                'date_histogram',
                                field='usage_start_date',
                                interval='month',
                                min_doc_count=1)
            agg.metric('cost', 'sum', field='cost')
            r = client.search(index='awsdetailedlineitem',
                              body=s.to_dict(),
                              size=0,
                              request_timeout=60)
            return {
                e['key_as_string']: e['cost']['value']
                for e in r['aggregations']['months']['buckets']
            }
        else:
            return {}

    @classmethod
    @with_cache()
    def get_lambda_usage(cls, keys, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', product_name='AWS Lambda')
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        agg = s.aggs.bucket('resources',
                            'terms',
                            field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'avg', field='cost')
        agg = agg.bucket('types', 'terms', field='usage_type', size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        agg = agg.bucket('descriptions',
                         'terms',
                         field='item_description',
                         size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        def _lambda_usage_regb(buckets, endswith):
            for b in buckets:
                if b['key'].endswith(endswith):
                    return b['quantity']['value']

        usages = [{
            'rid':
            usage['key'],
            'name':
            usage['key'].split(':')[-1],
            'requests':
            _lambda_usage_regb(usage['types']['buckets'], '-Request'),
            'gb_seconds':
            _lambda_usage_regb(usage['types']['buckets'], '-Lambda-GB-Second'),
            'cost':
            usage['cost']['value'],
            'raw_cost':
            lambdapricing.get_raw_cost([
                x['descriptions']['buckets'] for x in usage['types']['buckets']
            ]),
        } for usage in res['aggregations']['resources']['buckets']]
        return usages

    @classmethod
    @with_cache()
    def get_s3_bandwidth_costs(cls, key, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        agg = s.aggs.bucket('types',
                            'terms',
                            field='usage_type',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg.metric('gb', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        transfers = [{
            'type': transfer['key'],
            'quantity': transfer['gb']['value'],
            'cost': transfer['cost']['value'],
        } for transfer in res['aggregations']['types']['buckets']]
        return transfers

    @classmethod
    @with_cache()
    def get_ec2_bandwidth_costs(cls, key, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Elastic Compute Cloud')
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        agg = s.aggs.bucket('types',
                            'terms',
                            field='usage_type',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg.metric('gb', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        transfers = [{
            'type': transfer['key'],
            'quantity': transfer['gb']['value'],
            'cost': transfer['cost']['value'],
        } for transfer in res['aggregations']['types']['buckets']]
        return transfers

    @classmethod
    def get_ec2_daily_cost(cls, key):
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Elastic Compute Cloud')

        agg = s.aggs.bucket('intervals',
                            'date_histogram',
                            field='usage_start_date',
                            interval='day',
                            min_doc_count=1)
        agg.metric('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        for interval in res['aggregations']['intervals']['buckets']:
            yield interval['key_as_string'].split(
                'T')[0], interval['cost']['value']

    @classmethod
    @with_cache()
    def get_elb_usage_a_day(cls, keys, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        gib = Fraction(2**30)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s = s.filter("prefix", resource_id="arn:aws:elasticloadbalancing")
        s = s.sort({"usage_start_date": {"order": "desc"}})
        agg = s.aggs.bucket('rid',
                            'terms',
                            field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg = agg.bucket('types', 'terms', field='usage_type', size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        elbs = [{
            'rid':
            elb['key'],
            'cost':
            elb['cost']['value'] / (date_to - date_from).days,
            'hours':
            float(
                sum([
                    x['quantity']['value'] for x in elb['types']['buckets']
                    if x['key'].endswith('LoadBalancerUsage')
                ]) / (date_to - date_from).days),
            'bytes':
            float((sum([
                x['quantity']['value']
                for x in elb['types']['buckets'] if x['key'].endswith('Bytes')
            ]) * gib) / (date_to - date_from).days),
        } for elb in res['aggregations']['rid']['buckets']]
        return elbs

    @classmethod
    @with_cache()
    def get_instance_type(cls, keys, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(day=calendar.monthrange(
            date_from.year, date_from.month)[1],
                                               hour=23,
                                               minute=59,
                                               second=59,
                                               microsecond=999999)
        s = cls.search()
        s = s.extra(_source=[
            'usage_start_date', 'usage_type', 'availability_zone',
            'resource_id'
        ])
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s = s.filter("term", product_name='Amazon Elastic Compute Cloud')
        s = s.query('wildcard', usage_type='*BoxUsage:*')
        s = s.filter('exists', field='resource_id')
        s = s.sort({"usage_start_date": {"order": "desc"}})
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=10000,
                            request_timeout=60)

        def cut_region_name(s):
            # Availability zones end in a letter ('us-east-1a'); strip it to
            # recover the region name ('us-east-1').
            return s[:-1] if s[-1].isalpha() else s

        types = []
        refs = {}

        def add_in_types(entry, rid):
            # Group rows sharing (hour, instance type, region): the first
            # occurrence is stored, later ones only extend its resource ids.
            ref_tuple = (entry['hour'], entry['instance'], entry['region'])
            if ref_tuple in refs:
                refs[ref_tuple]['rids'].append(rid)
                refs[ref_tuple]['ridCount'] += 1
                return
            entry['rids'] = [rid]
            types.append(entry)
            refs[ref_tuple] = types[-1]

        for r in res['hits']['hits']:
            elem = {
                'hour': r['_source']['usage_start_date'],
                'instance': r['_source']['usage_type'].split(':')[1],
                'region': (cut_region_name(r['_source']['availability_zone'])
                           if 'availability_zone' in r['_source']
                           else 'unknown'),
                'ridCount': 1,
            }
            add_in_types(elem, r['_source']['resource_id'])
        return types
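
    # Illustrative shape of the grouped result (values hypothetical):
    #   [{'hour': '2017-01-01T00:00:00Z', 'instance': 'm4.large',
    #     'region': 'us-east-1', 'ridCount': 2, 'rids': ['i-abc', 'i-def']},
    #    ...]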

    @classmethod
    @with_cache()
    def get_instance_hour(cls,
                          keys,
                          date_from=None,
                          date_to=None,
                          min_hour=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s = s.filter("term", product_name='Amazon Elastic Compute Cloud')
        s = s.filter('prefix', resource_id='i-')
        s = s.query('wildcard', usage_type='*BoxUsage*')
        agg = s.aggs.bucket('resource_id',
                            'terms',
                            field='resource_id',
                            size=0x7FFFFFFF)
        agg.bucket('days',
                   'date_histogram',
                   field='usage_start_date',
                   interval='day',
                   min_doc_count=1)
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        instance_list = []
        for instance in res['aggregations']['resource_id']['buckets']:
            tmp_hours = [day['doc_count']
                         for day in instance['days']['buckets']]
            avg_hours = sum(tmp_hours) / float(len(tmp_hours))
            if not min_hour or avg_hours >= min_hour:
                instance_list.append(dict(id=instance['key'], hours=avg_hours))
        return sorted(instance_list, key=lambda x: x['hours'], reverse=True)
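
    # Illustrative result, sorted by decreasing average daily usage (values
    # hypothetical): [{'id': 'i-0abc123', 'hours': 24.0}, ...]. Passing
    # min_hour=20 keeps only instances averaging at least 20 hours a day.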

    @classmethod
    @with_cache()
    def get_s3_buckets_per_tag(cls, keys):
        def _check_if_in_list(dict_list, value, key):
            # Return the first dict whose `key` equals `value`, or None.
            return next((item for item in dict_list if item[key] == value),
                        None)

        def _parse_tag_keys_results(res):
            bucket_tagged = []
            for bucket_tag_key in res['aggregations']['tag_key']['buckets']:
                buff_tag_key = _check_if_in_list(bucket_tagged,
                                                 bucket_tag_key['key'],
                                                 'tag_key')
                if buff_tag_key is None:
                    buff_tag_key = {
                        "tag_key": bucket_tag_key['key'],
                        "tag_value": []
                    }
                    # Append only newly created entries; re-appending an
                    # existing one would duplicate it in the list.
                    bucket_tagged.append(buff_tag_key)
                _parse_tag_values_results(bucket_tag_key, buff_tag_key)
            return bucket_tagged

        def _parse_tag_values_results(bucket_tag_key, buff_tag_key):
            for bucket_tag_value in bucket_tag_key['tag_value']['buckets']:
                buff_tag_value = _check_if_in_list(buff_tag_key['tag_value'],
                                                   bucket_tag_value['key'],
                                                   'tag_value')
                if buff_tag_value is None:
                    buff_tag_value = {
                        "tag_value": bucket_tag_value['key'],
                        "s3_buckets": []
                    }
                    buff_tag_key['tag_value'].append(buff_tag_value)
                _parse_buckets_results(buff_tag_value, bucket_tag_value)
            return buff_tag_key

        def _parse_buckets_results(buff_tag_value, bucket_tag_value):
            for bucket_resource_id in bucket_tag_value['resource_id'][
                    'buckets']:
                buff_bucket_resource_id = _check_if_in_list(
                    buff_tag_value['s3_buckets'], bucket_resource_id['key'],
                    'bucket_name')
                if buff_bucket_resource_id is None:
                    buff_bucket_resource_id = {
                        "bucket_name": bucket_resource_id['key'],
                        "account_id":
                        bucket_resource_id['account_id']['buckets'][0]['key']
                    }
                    buff_tag_value['s3_buckets'].append(
                        buff_bucket_resource_id)
            return buff_tag_value

        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.query('exists', field="tag")
        s = s.query('wildcard', item_description="*storage*")
        agg = s.aggs.bucket('tag_key', 'terms', field="tag.key")
        agg = agg.bucket('tag_value', 'terms', field='tag.value')
        agg.bucket('resource_id', 'terms',
                   field='resource_id').bucket('account_id',
                                               'terms',
                                               field='linked_account_id')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        '''
        bucket_tagged structure
        [{
            "tag_key" : "KEY", # Unique in list
            "tag_value": [{
                "tag_value": "VALUE", # Unique in list
                "s3_buckets": [{
                    "bucket_name": "BUCKET_NAME",
                    "account_id": "ACCOUND_ID"
                }, {...}]
            }, {...}]
        }, {...}]
        '''
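
        # Example traversal of the structure documented above (illustrative):
        #   for tag in bucket_tagged:
        #       for value in tag['tag_value']:
        #           names = [b['bucket_name'] for b in value['s3_buckets']]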

        bucket_tagged = _parse_tag_keys_results(res)
        return bucket_tagged

    @classmethod
    @with_cache()
    def get_s3_bandwidth_info_and_cost_per_name(cls,
                                                key,
                                                bucket_resource_ids,
                                                date_from=None,
                                                date_to=None):
        # relativedelta(months=1) steps back one month (note the plural; the
        # singular `month` would instead pin the month to January).
        date_from = date_from or (datetime.utcnow() - relativedelta(
            months=1)).replace(day=1, hour=0, minute=0, second=0,
                               microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.filter('terms',
                     resource_id=bucket_resource_ids if isinstance(
                         bucket_resource_ids, list) else [bucket_resource_ids])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s = s.filter('wildcard', usage_type="*Bytes")
        agg = s.aggs.bucket('bucket_name',
                            'terms',
                            field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg = agg.bucket('transfer_type', 'terms', field='usage_type')
        agg.metric('data', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        data = [{
            "bucket_name": bucket['key'],
            "cost": bucket['cost']['value'],
            "transfer_stats": [{
                "type": transfer_stat['key'],
                "data": transfer_stat['data']['value']
            } for transfer_stat in bucket['transfer_type']['buckets']]
        } for bucket in res['aggregations']['bucket_name']['buckets']]
        return data
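
# Minimal usage sketch for the cost methods above, assuming the enclosing
# DocType is exposed as `AWSDetailedLineitem` (the class name is not shown in
# this excerpt and is hypothetical here) and an 'awsdetailedlineitem' index
# is reachable.
from datetime import datetime

account = '123456789012'  # hypothetical linked account id
elbs = AWSDetailedLineitem.get_elb_usage_a_day(
    [account],
    date_from=datetime(2017, 1, 1),
    date_to=datetime(2017, 1, 31, 23, 59, 59, 999999))
for elb in elbs:
    print(elb['rid'], round(elb['cost'], 2), elb['hours'])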
Exemplo n.º 22
0
class SentimentAnalysis(PluginBase):
    """Class implementing a sentiment analysis plugin.  Wraps the TextBlob library."""

    data_schema = {"text": es.Object(Text)}

    def __init__(self, *args, **kwargs) -> None:
        """Initialise SentimentAnalysis instance, ensure we have downloaded required data."""
        # Ensure we have the various corpora that we need for the analysis to work.
        text_blob_download()
        nltk_download('vader_lexicon')

        # Initialise the Vader sentiment analysis tool
        self.analyser = SentimentIntensityAnalyzer()

        super().__init__(*args, **kwargs)  # type: ignore

    def process_tweet(self, tweet_json: Dict[str, Any]) -> Dict[str, Any]:
        """Attempt to analyse sentiment of the given tweet data."""
        if not tweet_json:
            return tweet_json

        raw_tweet: Dict[str, Any] = cast(Dict[str, Any], tweet_json.get('_raw'))
        if not raw_tweet:
            return tweet_json

        # Pull either the short text or the full text, depending on whether
        # the tweet was truncated.
        text_processing: Dict[str, Any] = {}
        text_processing['short_text'] = raw_tweet['text']
        text_processing['truncated'] = raw_tweet['truncated']
        if text_processing['truncated']:
            text_processing['full_text'] = raw_tweet['extended_tweet']['full_text']

        tweet_text = (
            text_processing['full_text']
            if text_processing['truncated'] else text_processing['short_text']
        )

        text_processing.update(
            self._blob_process_tweet(
                tweet_text=tweet_text,
                tweet_lang=raw_tweet['lang'],
            )
        )

        # NLTK Vader sentiment analysis - translated field is created during
        # the text blob processing.
        text_processing.update(self._vader_classify(text_processing['translated']))

        LOG.info(
            'Polarity: %s, Subjectivity: %s, Word Count: %s, Language: %s',
            text_processing['pattern_polarity'],
            text_processing['pattern_subjectivity'],
            text_processing['tweet_length'],
            raw_tweet['lang']
        )

        tweet_json['text'] = text_processing

        return tweet_json

    @staticmethod
    def _blob_process_tweet(tweet_text: str, tweet_lang: str) -> Dict[str, Any]:
        """Analyse tweet text using the TextBlob class."""
        text_processing: Dict[str, Union[str, int]] = {}

        blob = TextBlob(tweet_text)
        if not blob:
            return {}

        try:
            if tweet_lang not in ('en', 'und', None):
                LOG.debug('Attempting to translate from %s to English', tweet_lang)
                # We make use of the Tenacity retry library here to simplify
                # repeating a function call, however the default implementation
                # is a decorator, hence the pair of calls here, which return a
                # wrapped function which will deal with retries.
                retrying_translator = retry(
                    stop=stop_after_attempt(5),
                    wait=wait_fixed(0.5)
                )(
                    blob.translate
                )
                new_blob = retrying_translator(to='en')
                blob = new_blob
        except NotTranslated:
            pass
        except RetryError:
            LOG.error('Unable to translate tweet contents: %s', str(blob))

        text_processing['translated'] = str(blob)

        LOG.debug(
            'Analysing the following sentence for sentiment: %s',
            text_processing['translated']
        )

        # TextBlob-based sentiment analysis, backed by the Pattern analyser.
        text_processing['pattern_polarity'] = blob.sentiment.polarity
        text_processing['pattern_subjectivity'] = blob.sentiment.subjectivity

        text_processing['tweet_length'] = len(blob.words)

        return text_processing

    def _vader_classify(self, tweet_text: str) -> Dict[str, Any]:
        """Analyse tweet text for sentiment using NLTK Vader Algorithm."""
        # NLTK Vader sentiment analysis.
        text_processing: Dict[str, float] = {}
        sentiment_scores = self.analyser.polarity_scores(tweet_text)
        if any(sentiment_scores):
            # Make sure we have something to dump into the output data.
            # {'neg': 0.347, 'neu': 0.653, 'pos': 0.0, 'compound': -0.1511}
            text_processing['vader_negative'] = sentiment_scores['neg']
            text_processing['vader_neutral'] = sentiment_scores['neu']
            text_processing['vader_positive'] = sentiment_scores['pos']
            text_processing['vader_compound'] = sentiment_scores['compound']
            text_processing['vader_compound_inverted'] = sentiment_scores['compound'] * -1

        return text_processing
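
# Standalone sketch of the two analysis steps above in isolation. The nltk
# and tenacity APIs are real; the sample inputs are illustrative.
from nltk import download as nltk_download
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed

nltk_download('vader_lexicon')
analyser = SentimentIntensityAnalyzer()
print(analyser.polarity_scores('What a wonderful day!'))
# e.g. {'neg': 0.0, 'neu': 0.213, 'pos': 0.787, 'compound': 0.5859}

def flaky_translate():
    raise IOError('transient failure')

# Same non-decorator retry pattern as in _blob_process_tweet: build the
# decorator first, then apply it to an existing callable.
retrying = retry(stop=stop_after_attempt(5),
                 wait=wait_fixed(0.5))(flaky_translate)
try:
    retrying()
except RetryError:
    print('gave up after 5 attempts')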
Exemplo n.º 23
0
class AWSStat(dsl.DocType):
    class Meta:
        index = 'awsstat'

    key = dsl.String(index='not_analyzed')
    time = dsl.Date(format='date_optional_time||epoch_millis')
    stat = dsl.String(index='not_analyzed')
    data = dsl.Object(enabled=False)

    @classmethod
    def latest_instance_stats(cls, key):
        s = cls.search()
        s = s.filter('term', key=key)
        s = s.filter('term', stat='instances').sort('-time')
        res = client.search(index='awsstat',
                            body=s.to_dict(),
                            size=10,
                            request_timeout=60)
        stats = []
        for r in res['hits']['hits']:
            stat = r['_source']['data']
            stat.update(time=r['_source']['time'])
            stats.append(stat)
        stats.sort(key=lambda stat: stat['time'], reverse=True)
        return dict(stats=stats)

    @classmethod
    def get_latest_instance_states(cls, key, instance_id, days=5):
        s = cls.search()
        s = s.filter('term', key=key)
        s = s.filter('term', stat='instancestate/' + instance_id).sort('-time')
        res = client.search(index='awsstat',
                            body=s.to_dict(),
                            size=days,
                            request_timeout=60)

        states = []
        for r in res['hits']['hits']:
            states.append(
                dict(time=r['_source']['time'],
                     state=r['_source']['data']['state']))
        return states

    @classmethod
    def latest_on_demand_to_reserved_suggestion(cls, keys):
        keys = any_key_to_string_array(keys)
        s = cls.search()
        s = s.filter('terms', key=keys)
        s = s.filter('term', stat='ondemandtoreserved').sort('-time')
        res = client.search(index='awsstat',
                            body=s.to_dict(),
                            size=1,
                            request_timeout=60)

        if res['hits']['total'] > 0:
            return res['hits']['hits'][0]['_source']['data']
        return dict(total=0)

    @classmethod
    def latest_s3_space_usage(cls, keys):
        keys = any_key_to_string_array(keys)
        s = cls.search()
        s = s.filter('terms', key=keys)
        s = s.filter('term', stat='s3spaceusage').sort('-time')
        res = client.search(index='awsstat',
                            body=s.to_dict(),
                            size=1,
                            request_timeout=60)

        if res['hits']['total'] > 0:
            return res['hits']['hits'][0]['_source']['data']
        return None

    @classmethod
    def latest_available_volumes(cls, keys):
        keys = any_key_to_string_array(keys)
        s = cls.search()
        s = s.filter('terms', key=keys)
        s = s.filter('term', stat='detachedvolumes').sort('-time')
        res = client.search(index='awsstat',
                            body=s.to_dict(),
                            size=1,
                            request_timeout=60)

        if res['hits']['total'] > 0:
            return res['hits']['hits'][0]['_source']['data']
        return dict(total=0)

    @classmethod
    def latest_hourly_cpu_usage_by_tag(cls, key):
        s = cls.search()
        s = s.filter('term', key=key)
        s = s.filter('term', stat='hourlycpubytag').sort('-time')
        res = client.search(index='awsstat',
                            body=s.to_dict(),
                            size=1,
                            request_timeout=60)

        if res['hits']['total'] > 0 and 'data' in res['hits']['hits'][0][
                '_source']:
            return res['hits']['hits'][0]['_source']['data']
        return dict(tags=[])

    @classmethod
    def latest_daily_cpu_usage_by_tag(cls, key):
        s = cls.search()
        s = s.filter('term', key=key)
        s = s.filter('term', stat='dailycpubytag').sort('-time')
        res = client.search(index='awsstat',
                            body=s.to_dict(),
                            size=1,
                            request_timeout=60)

        if res['hits']['total'] > 0 and 'data' in res['hits']['hits'][0][
                '_source']:
            return res['hits']['hits'][0]['_source']['data']
        return dict(tags=[])

    @classmethod
    def latest_stopped_instances_report(cls, keys):
        keys = any_key_to_string_array(keys)
        s = cls.search()
        s = s.filter('terms', key=keys)
        s = s.filter('term', stat='stoppedinstancesreport').sort('-time')
        res = client.search(index='awsstat',
                            body=s.to_dict(),
                            size=1,
                            request_timeout=60)

        if res['hits']['total'] > 0:
            return res['hits']['hits'][0]['_source']['data']
        return dict(total=0)
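
# Minimal usage sketch, assuming an 'awsstat' index is reachable; the key is
# hypothetical. Each accessor returns the most recent matching stat document,
# or a small default when none exists.
volumes = AWSStat.latest_available_volumes('my-report-key')
if volumes['total']:
    print(volumes)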
Exemplo n.º 24
0
class PublicationDoc(DocType):
    all_data = edsl.Text()
    id = edsl.Integer()
    title = edsl.Text(copy_to=ALL_DATA_FIELD)
    date_published = edsl.Date()
    last_modified = edsl.Date()
    code_archive_url = edsl.Keyword()
    doi = edsl.Keyword()
    contact_email = edsl.Keyword(copy_to=ALL_DATA_FIELD)
    container = edsl.Object(ContainerInnerDoc)
    tags = edsl.Nested(RelatedInnerDoc)
    sponsors = edsl.Nested(RelatedInnerDoc)
    platforms = edsl.Nested(RelatedInnerDoc)
    model_documentation = edsl.Keyword()
    authors = edsl.Nested(AuthorInnerDoc)

    @classmethod
    def from_instance(cls, publication):
        container = publication.container
        doc = cls(meta={'id': publication.id},
                  id=publication.id,
                  title=publication.title,
                  date_published=publication.date_published,
                  last_modified=publication.date_modified,
                  code_archive_url=publication.code_archive_url,
                  contact_email=publication.contact_email,
                  container=ContainerInnerDoc(id=container.id,
                                              name=container.name,
                                              issn=container.issn),
                  doi=publication.doi,
                  tags=[
                      RelatedInnerDoc(id=t.id, name=t.name)
                      for t in publication.tags.all()
                  ],
                  sponsors=[
                      RelatedInnerDoc(id=s.id, name=s.name)
                      for s in publication.sponsors.all()
                  ],
                  platforms=[
                      RelatedInnerDoc(id=p.id, name=p.name)
                      for p in publication.platforms.all()
                  ],
                  model_documentation=[
                      md.name for md in publication.model_documentation.all()
                  ],
                  authors=[
                      AuthorInnerDoc(id=a.id,
                                     name=a.name,
                                     orcid=a.orcid,
                                     researcherid=a.researcherid,
                                     email=a.email)
                      for a in publication.creators.all()
                  ])
        return doc.to_dict(include_meta=True)
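
    # Note: `from_instance` returns a plain action dict (with `_index`/`_id`
    # metadata) rather than a PublicationDoc instance, so its output can be
    # streamed straight into elasticsearch.helpers.bulk when reindexing.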

    def get_public_detail_url(self):
        return reverse('core:public-publication-detail',
                       kwargs={'pk': self.meta.id})

    @classmethod
    def get_breadcrumb_data(cls):
        return {
            'breadcrumb_trail': [{
                'link': reverse('core:public-home'),
                'text': 'Home'
            }, {
                'text': 'Publications'
            }]
        }

    @classmethod
    def get_public_list_url(cls, search=None):
        location = reverse('core:public-search')
        if search:
            query_string = urlencode({'search': search})
            location += '?{}'.format(query_string)
        return location

    class Index:
        name = 'publication'
        settings = {'number_of_shards': 1}
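
# Minimal reindexing sketch built on `from_instance`, assuming a Django
# `Publication` queryset and an Elasticsearch client named `es_client`
# (both hypothetical here).
from elasticsearch.helpers import bulk

PublicationDoc.init(using=es_client)  # ensure the 'publication' index exists
bulk(es_client,
     (PublicationDoc.from_instance(p) for p in Publication.objects.all()))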