def test_ingest_sprayday_hadoop_true(self, mock):
    """
    Verify that ingest_sprayday posts the expected Hadoop index-task
    payload (schema JSON and indexer URL) when the
    DRUID_USE_INDEX_HADOOP setting is enabled.
    """
    source_url = "https://example.com/data.json"
    window = "2013-01-01/2013-01-02"
    with self.settings(DRUID_USE_INDEX_HADOOP=True):
        # Build the payload the task is expected to produce.  The
        # nested dicts are aliased, so mutations land in `expected`.
        expected = get_sprayday_hadoop_schema()
        data_schema = expected['spec']['dataSchema']
        data_schema['dataSource'] = sprayday_datasource
        expected['spec']['ioConfig']['inputSpec']['paths'] = source_url
        spec = data_schema['parser']['parseSpec']
        spec['dimensionsSpec'] = dimensions_spec
        spec['timestampSpec']['column'] = settings.DRUID_TIMESTAMP_COLUMN
        data_schema['granularitySpec']['intervals'] = [window]
        expected_json = json.dumps(expected)

        ingest_sprayday(source_url, window)

        # The task must have POSTed exactly this payload to the indexer.
        self.assertTrue(mock.called)
        call_args, _ = mock.call_args_list[0]
        self.assertEqual(call_args[0], expected_json)
        self.assertEqual(call_args[1], get_druid_indexer_url())
def get_historical_data(day=None, month=None, year=None):
    """
    Fetch SprayDay records matching any combination of day, month and
    year, dump them to a Druid JSON file on S3 and trigger ingestion.

    Returns the result of ingest_sprayday, or None when no date part
    was supplied or no records matched the filters.
    """
    if not any([year, month, day]):
        return None
    # Path segments use only the date parts that were provided,
    # ordered year/month/day.
    date_path = "/".join(str(part) for part in (year, month, day)
                         if part is not None)
    filename = "{datasource}/".format(
        datasource=settings.DRUID_SPRAYDAY_DATASOURCE) + \
        date_path + "/sprayday.json"
    queryset = SprayDay.objects.all()
    if day:
        queryset = queryset.filter(spray_date__day=day)
    if month:
        queryset = queryset.filter(spray_date__month=month)
    if year:
        queryset = queryset.filter(spray_date__year=year)
    if queryset:
        intervals = get_druid_intervals(queryset)
        stored_path = create_sprayday_druid_json_file(queryset=queryset,
                                                      filename=filename)
        url = get_s3_url(stored_path)
        return ingest_sprayday(url, intervals=intervals)
def get_data(minutes=settings.DRUID_BATCH_PROCESS_TIME_INTERVAL):
    """
    Collect submissions from the last ``minutes`` minutes, write them
    to a Druid JSON file on S3 and kick off ingestion.

    Returns the result of ingest_sprayday, or None if nothing was
    submitted within the window.
    """
    queryset = get_sprayday_queryset_from_x_minutes(minutes)
    if not queryset:
        return None
    # Name the batch file after the first and last submission
    # timestamps so each run produces a distinct file.
    start_time = queryset.first().data['_submission_time']
    end_time = queryset.last().data['_submission_time']
    intervals = get_druid_intervals(queryset, use_timestamp=True)
    filename = (
        "{datasource}/minutes".format(
            datasource=settings.DRUID_SPRAYDAY_DATASOURCE)
        + "/sprayday-{start_time}-{end_time}.json".format(
            start_time=start_time, end_time=end_time)
    )
    stored_path = create_sprayday_druid_json_file(queryset=queryset,
                                                 filename=filename)
    return ingest_sprayday(get_s3_url(stored_path), intervals=intervals)
def handle(self, *args, **options):
    """Management-command entry point: ingest the file at ``options['path']``."""
    ingest_sprayday(options['path'])