Example No. 1
import itertools
import os
from copy import deepcopy

from pydruid.client import PyDruid

# BaseLoader, Probe and Trace are assumed to be defined elsewhere in this project.


class DruidLoader(BaseLoader):
    def __init__(
        self,
        url="https://druid.broker.develop.otonomousmobility.com/",
        endpoint="druid/v2",
        datasource="mytaxi_gps_probes_index_parallel_v4",
        username=None,
        password=None,
    ):
        super().__init__("druid")
        self.url = url
        self.endpoint = endpoint
        self.datasource = datasource
        self.connector = PyDruid(url, endpoint)
        self.connector.set_basic_auth_credentials(
            username or os.environ["USERNAME"],
            password or os.environ["PASSWORD"],
        )

        interval = self.connector.time_boundary(
            datasource=self.datasource).result[0]["result"]
        self.interval = f'{interval["minTime"]}/{interval["maxTime"]}'
        self.default_query = {
            "datasource": self.datasource,
            "granularity": "all",
            "intervals": self.interval,
            "paging_spec": {
                "paging_identifiers": {},
                "threshold": 100
            },
        }

    def load(self, **kwargs):
        query = deepcopy(self.default_query)
        query.update(kwargs)
        for trace in self.connector.select(**query):
            probes = [
                Probe.from_druid(probe) for probe in trace["result"]["events"]
            ]
            yield Trace(probes, identifier=self._extract_booking_id(trace))

    @staticmethod
    def _extract_booking_id(trace):
        probe_groups = {
            k: len(list(v))
            for k, v in itertools.groupby(
                trace["result"]["events"],
                key=lambda event: event["event"]["bookingid"])
        }
        if len(probe_groups) > 1:
            raise ValueError(
                f"Trace has probes from different bookings: {probe_groups.keys()}"
            )

        return list(probe_groups.keys())[0]
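
A minimal usage sketch for DruidLoader, assuming the USERNAME and PASSWORD
environment variables are set and that Trace keeps the identifier keyword
argument as an attribute (both assumptions, not shown in the example):

loader = DruidLoader()  # hypothetical call with the default broker and datasource
for trace in loader.load():
    # each yielded Trace holds the probes of a single booking
    print(trace.identifier)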
Example No. 2
from pydruid.client import PyDruid
from pydruid.utils.filters import Dimension

# DRUID_URL is assumed to be defined elsewhere, e.g. the broker's base URL.


def query_druid():
    client = PyDruid(DRUID_URL, 'druid/v2')
    query = client.select(
        datasource='pageviews1',
        granularity='all',
        dimensions=["url", "user"],
        filter=Dimension('user') == 'ethan',
        paging_spec={"pagingIdentifiers": {}, "threshold": 5},
        intervals=["2016-07-08/2017-09-13"]
    )
    # print(json.dumps(query.result, indent=2))
    return query.result
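
The select query returns one result object per scanned segment; a short sketch of
flattening those results into plain event dicts, assuming query_druid() runs
against a reachable broker:

results = query_druid()
# each segment result wraps its rows under result -> events -> event
events = [row['event'] for segment in results for row in segment['result']['events']]
print(len(events))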
Example No. 3
import json
import time
from datetime import datetime, timedelta

import requests
from pydruid.client import PyDruid
from pydruid.utils.aggregators import doublesum
from pydruid.utils.filters import Dimension

# DRUID_BROKER_URL, OVERLORD_URL, TRANQUILITY_URL, TABLE_NAME, YMD_FORMAT,
# base_path and DruidPocException are assumed to be defined elsewhere in the project.


class MeDruidHelper(object):
    """
    Market Events on Druid Helper
    Auxiliary class for working with Market Events in Druid
    """
    events_dir = 'G:/work'
    in_vm_dir = '/mnt/hgfs/G/work'

    def __init__(self):
        self.client = PyDruid(DRUID_BROKER_URL, 'druid/v2')

    @staticmethod
    def index_market_events(file_name, market_events):
        """
        Creates data file from list of market_events at location accessible to Druid and submits indexing task

        :type file_name: str
        :type market_events: list

        :param file_name: name of the data file
        :param market_events: list of events
        :return:
        """

        task_proto_path = base_path + '/market_event_indexing_task_proto.json'
        with open(task_proto_path) as fh:
            indexing_task_spec = json.loads(fh.read())
        if indexing_task_spec is None:
            raise DruidPocException('unable to load indexing task proto from ' + task_proto_path)

        # model for indexing task is needed for production use
        indexing_task_spec['spec']['ioConfig']['inputSpec']['paths'] = MeDruidHelper.in_vm_dir + '/' + file_name

        with open(MeDruidHelper.events_dir + '/' + file_name, 'w') as events_fh:
            for event in market_events:
                events_fh.write(json.dumps(vars(event), sort_keys=True) + '\n')

        MeDruidHelper.submit_synchronous_indexing_task(indexing_task_spec)

    @staticmethod
    def submit_synchronous_indexing_task(indexing_task_spec):
        submit_response = requests.post(OVERLORD_URL, headers={'Content-Type': 'application/json'},
                                        data=json.dumps(indexing_task_spec))
        if submit_response.status_code == 200 and submit_response.reason == 'OK':
            task_id = json.loads(submit_response.text)['task']
            tracking_url = '%s/%s/status' % (OVERLORD_URL, task_id)
            print('Indexing should begin shortly. Tracking URL: %s' % tracking_url)
            MeDruidHelper.track_indexing_task(task_id)
        else:
            print('Failed submitting task, reason: ' + submit_response.reason)

    @staticmethod
    def track_indexing_task(task_id):
        tracking_url = '%s/%s/status' % (OVERLORD_URL, task_id)
        status_response = requests.get(tracking_url)
        print(status_response.json())
        task_status = status_response.json()['status']['status']
        while status_response.status_code == 200 and task_status not in ['SUCCESS', 'FAILED']:
            time.sleep(10)
            status_response = requests.get(tracking_url)
            task_status = status_response.json()['status']['status']
            print('[%d] %s - %s' % (status_response.status_code, task_status, status_response.json()))

    @staticmethod
    def post_to_tranquility(record, table_name=TABLE_NAME):
        """
        used for streaming into Druid through tranquility
        :param record:
        :param table_name:
        :return:
        """
        payload = json.dumps(record.__dict__)
        print(payload)
        load_response = requests.post(url=TRANQUILITY_URL + '/' + table_name,
                                      headers={'Content-Type': 'application/json'},
                                      data=payload)
        print "[%d] %s\n" % (load_response.status_code, load_response.text)

    @staticmethod
    def shutdown_streaming_task(task_id):
        task_shutdown_url = '%s/%s/shutdown' % (OVERLORD_URL, task_id)
        response = requests.post(task_shutdown_url)
        print('[%d] %s' % (response.status_code, response.json()))

    def select_one_market_event(self, product_name):
        query = self.client.select(
            datasource=TABLE_NAME,
            granularity='all',
            dimensions=['product_name'],
            filter=Dimension('product_name') == product_name,
            paging_spec={"pagingIdentifiers": {}, "threshold": 1},
            intervals=["2016-07-08/2017-09-13"]
        )

        events = [segment_result['result']['events'] for segment_result in query.result]
        if len(events) >= 1:
            return events[0]
        return []

    def positions_delta(self, product_name, min_num_employees, start_dt, end_dt):
        """
        :type product_name: str
        :type min_num_employees: int
        :type start_dt: datetime
        :type end_dt: datetime
        """
        query = self.client.timeseries(
            datasource=TABLE_NAME,
            granularity='month',
            intervals=[start_dt.strftime(YMD_FORMAT) + '/' + end_dt.strftime(YMD_FORMAT)],
            filter=((Dimension('product_name') == product_name) &
                    (Dimension('customer_num_employees') > min_num_employees)),
            aggregations={"qty": doublesum("qty")},
        )
        print(query.result)
        delta = 0
        for item in query.result:
            delta += item['result']['qty']
        return delta

    @staticmethod
    def yesterday():
        return (datetime.now() - timedelta(days=1)).strftime(YMD_FORMAT)
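
A usage sketch for the helper, assuming the constants above point at a running
Druid cluster; the product name, headcount and dates below are purely illustrative:

helper = MeDruidHelper()
# net quantity change per month for one product, large customers only
delta = helper.positions_delta(
    product_name='example-product',  # hypothetical value
    min_num_employees=100,
    start_dt=datetime(2016, 7, 1),
    end_dt=datetime(2017, 9, 1),
)
print('positions delta:', delta)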
Example No. 4
from pydruid.client import PyDruid
from pydruid.utils.aggregators import doublemax, longmax
from pydruid.utils.filters import Dimension

# PlyQL and validators are assumed to be helper modules provided by the surrounding project.


class DruidAccessLayer(object):
    timeseries_granularities = ['none', 'second', 'minute',
                                'fifteen_minute', 'thirty_minute', 'hour',
                                'day', 'week', 'month', 'quarter', 'year']

    select_granularities = ['all', 'second', 'minute',
                            'fifteen_minute', 'thirty_minute', 'hour',
                            'day', 'week', 'month', 'quarter', 'year']

    def __init__(self):
        self.connection = None
        self.plyql = None

    def connect(self, uri):
        self.connection = PyDruid('http://{0}'.format(uri), 'druid/v2/')
        self.plyql = PlyQL(uri)
        try:
            tables = self.tables()
            if {'Tables_in_database': 'supervisor'} not in tables:
                raise Exception('Druid connection error: missing '
                                '"supervisor" table')
        except Exception:
            raise Exception('Druid connection error: {0}'.format(uri))

    def __validate_granularity__(self, granularity, supported_granularities):
        # validate against the list supplied by the caller (timeseries vs. select)
        if granularity in supported_granularities:
            query_granularity = granularity
        elif validators.duration(granularity):
            query_granularity = {'type': 'period', 'period': granularity}
        else:
            raise ValueError(
                'Unsupported granularity "{0}"'.format(granularity))
        return query_granularity

    def __validate_intervals__(self, intervals):
        if not validators.interval(intervals):
            raise ValueError('Unsupported interval "{0}"'.format(intervals))
        return intervals

    def tables(self):
        return self.plyql.query('SHOW TABLES')

    def processes(self, agent_id, period='P6W'):
        return self.plyql.query('SELECT process_name AS process, '
                                'COUNT() AS count, MAX(__time) AS time '
                                'FROM supervisor WHERE agent_id = "{0}" '
                                'GROUP BY process_name;'
                                .format(agent_id), period)

    def timeseries(self, agent_id, process_name, granularity='none',
                   intervals='P6W', descending=False):
        query_granularity = self.__validate_granularity__(
            granularity, self.timeseries_granularities)
        intervals = self.__validate_intervals__(intervals)

        return self.connection.timeseries(
            datasource='supervisor',
            granularity=query_granularity,
            descending=descending,
            intervals=intervals,
            aggregations={'cpu': doublemax('cpu'),
                          'mem': longmax('mem')},
            context={'skipEmptyBuckets': 'true'},
            filter=(Dimension('agent_id') == agent_id) &
            (Dimension('process_name') == process_name))

    def select(self, agent_id, process_name, granularity='all',
               intervals='P6W', descending=True):
        query_granularity = self.__validate_granularity__(
            granularity, self.select_granularities)
        intervals = self.__validate_intervals__(intervals)

        return self.connection.select(
            datasource='supervisor',
            granularity=query_granularity,
            intervals=intervals,
            descending=descending,
            dimensions=['process_name'],
            metrics=['cpu', 'mem'],
            filter=(Dimension('agent_id') == agent_id) &
            (Dimension('process_name') == process_name),
            paging_spec={'pagingIdentifiers': {}, "threshold": 1}
        )
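
A usage sketch for DruidAccessLayer, assuming a broker is reachable at the given
host:port and exposes the 'supervisor' datasource; the address, agent id and
process name below are placeholders:

layer = DruidAccessLayer()
layer.connect('localhost:8082')

# hourly CPU/memory maxima for one process on one agent over the last day
ts = layer.timeseries('agent-001', 'nginx', granularity='hour', intervals='P1D')
print(ts.result)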