def prepare_training_data(self, parameters):
    end_time = str_to_dt(parameters['endTime'])
    if 'startTime' in parameters:
        start_time = str_to_dt(parameters['startTime'])
    else:
        start_time = end_time
    factor_def = parameters['seriesSets']
    factors_data = self.tsanaclient.get_timeseries(parameters['apiKey'], factor_def,
                                                   start_time, end_time)

    # Dump each series to its own CSV under a unique working directory
    time_key = dt_to_str_file_name(end_time)
    data_dir = os.path.join(self.config.model_data_dir, time_key, str(uuid.uuid1()))
    shutil.rmtree(data_dir, ignore_errors=True)
    os.makedirs(data_dir, exist_ok=True)
    variable = {}
    for factor in factors_data:
        csv_file = factor.series_id + '.csv'
        csv_data = [('timestamp', 'value')]
        csv_data.extend([(point['timestamp'], point['value']) for point in factor.value])
        save_to_csv(csv_data, os.path.join(data_dir, csv_file))
        variable[factor.series_id] = csv_file

    # Zip the CSVs and upload the archive to blob storage
    zip_dir = os.path.abspath(os.path.join(data_dir, os.pardir))
    zip_file_base = os.path.join(zip_dir, 'training_data')
    zip_file = zip_file_base + '.zip'
    if os.path.exists(zip_file):
        os.remove(zip_file)
    shutil.make_archive(zip_file_base, 'zip', data_dir)
    azure_blob = AzureBlob(self.config.az_tsana_model_blob_connection)
    container_name = self.config.tsana_app_name
    azure_blob.create_container(container_name)
    blob_name = 'training_data_' + time_key
    with open(zip_file, "rb") as data:
        azure_blob.upload_blob(container_name, blob_name, data)
    os.remove(zip_file)
    blob_url = AzureBlob.generate_blob_sas(self.config.az_storage_account,
                                           self.config.az_storage_account_key,
                                           container_name, blob_name)

    result = {}
    result['variable'] = variable
    result['fillUpMode'] = parameters['instance']['params']['fillUpMode']
    result['tracebackWindow'] = parameters['instance']['params']['tracebackWindow']
    # A locally mounted copy of the archive is used as the source here;
    # switch to blob_url to read it from blob storage via the SAS URL instead.
    # result['source'] = blob_url
    result['source'] = '/data/training_data.zip'
    result['startTime'] = dt_to_str(start_time)
    result['endTime'] = dt_to_str(end_time)
    return result

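# The code above relies on a save_to_csv helper whose implementation is not
# shown in this section. A minimal sketch, assuming it simply writes the
# given rows (header tuple first) to a CSV file:
import csv

def save_to_csv(rows, file_path):
    # Each row is a tuple, e.g. ('timestamp', 'value') or an actual data point
    with open(file_path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)
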
def get_timeseries(self, api_key, series_sets, start_time, end_time,
                   offset=0, granularityName=None, granularityAmount=0, top=1):
    # An offset is only meaningful when a granularity is given
    if offset != 0 and granularityName is None:
        offset = 0
    end_str = dt_to_str(end_time)
    start_str = dt_to_str(start_time)
    dedup = {}
    series = []
    # Resolve each series set to its matching series via rank-series
    for data in series_sets:
        dim = {}
        if 'dimensionFilter' not in data:
            data['dimensionFilter'] = data['filters']
        for dimkey in data['dimensionFilter']:
            dim[dimkey] = [data['dimensionFilter'][dimkey]]
        para = dict(metricId=data['metricId'], dimensions=dim,
                    count=top, startTime=start_str)
        ret = self.post(api_key, '/metrics/' + data['metricId'] + '/rank-series',
                        data=para)
        for s in ret['value']:
            if s['seriesId'] not in dedup:
                s['seriesSetId'] = data['seriesSetId']
                s['startTime'] = start_str
                s['endTime'] = end_str
                s['dimension'] = s['dimensions']
                del s['dimensions']
                series.append(s)
                dedup[s['seriesId']] = True

    # Query the data for all matched series in a single call
    multi_series_data = None
    if len(series) > 0:
        ret = self.post(api_key, '/metrics/series/data', data=dict(value=series))
        if granularityName is not None:
            multi_series_data = [
                Series(factor['id']['metricId'], series[idx]['seriesSetId'],
                       factor['id']['dimension'],
                       [dict(timestamp=get_time_offset(str_to_dt(y[0]),
                                                       (granularityName, granularityAmount),
                                                       offset),
                             value=y[1])
                        for y in factor['values']])
                for idx, factor in enumerate(ret['value'])
            ]
        else:
            multi_series_data = [
                Series(factor['id']['metricId'], series[idx]['seriesSetId'],
                       factor['id']['dimension'],
                       value=[dict(timestamp=y[0], value=y[1])
                              for y in factor['values']])
                for idx, factor in enumerate(ret['value'])
            ]
    else:
        log.info("Series is empty")
    return multi_series_data

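# The Series container constructed above is assumed rather than shown in this
# section. A minimal sketch consistent with its call sites (positional
# metric_id / series_set_id / dimension plus a `value` list of
# {timestamp, value} dicts, and the `series_id` attribute read elsewhere);
# how the real class derives series_id is not shown, so the composition
# below is an assumption:
class Series:
    def __init__(self, metric_id, series_set_id, dimension, value):
        self.metric_id = metric_id
        self.series_set_id = series_set_id
        self.dimension = dimension
        self.value = value
        # Hypothetical key: metric id plus sorted dimension key/value pairs
        self.series_id = '_'.join([metric_id] + ['%s_%s' % (k, dimension[k])
                                                 for k in sorted(dimension)])
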
def prepare_inference_data(self, parameters):
    start_time, end_time = self.get_data_time_range(parameters)
    factor_def = parameters['seriesSets']
    factors_data = self.tsanaclient.get_timeseries(parameters['apiKey'], factor_def,
                                                   start_time, end_time)
    time_key = dt_to_str_file_name(end_time)
    data_dir = os.path.join(self.config.model_data_dir, time_key, str(uuid.uuid1()))
    shutil.rmtree(data_dir, ignore_errors=True)
    os.makedirs(data_dir, exist_ok=True)
    try:
        # Dump each series to its own CSV, zip them, and upload the archive
        for factor in factors_data:
            csv_file = factor.series_id + '.csv'
            csv_data = [('timestamp', 'value')]
            csv_data.extend([(point['timestamp'], point['value']) for point in factor.value])
            save_to_csv(csv_data, os.path.join(data_dir, csv_file))
        zip_dir = os.path.abspath(os.path.join(data_dir, os.pardir))
        zip_file_base = os.path.join(zip_dir, 'inference_data')
        zip_file = zip_file_base + '.zip'
        if os.path.exists(zip_file):
            os.remove(zip_file)
        shutil.make_archive(zip_file_base, 'zip', data_dir)
        azure_blob = AzureBlob(self.config.az_tsana_model_blob_connection)
        container_name = self.config.tsana_app_name
        azure_blob.create_container(container_name)
        blob_name = 'inference_data_' + time_key
        with open(zip_file, "rb") as data:
            azure_blob.upload_blob(container_name, blob_name, data)
        os.remove(zip_file)
        blob_url = AzureBlob.generate_blob_sas(self.config.az_storage_account,
                                               self.config.az_storage_account_key,
                                               container_name, blob_name)
        result = {}
        result['source'] = blob_url
        result['startTime'] = dt_to_str(start_time)
        result['endTime'] = dt_to_str(end_time)
        return result
    finally:
        shutil.rmtree(data_dir, ignore_errors=True)

def get_inference_result(self, parameters, start_time, end_time):
    try:
        ret = self.get(parameters['apiKey'],
                       '/timeSeriesGroups/' + parameters['groupId']
                       + '/appInstances/' + parameters['instance']['instanceId']
                       + '/history?startTime=' + dt_to_str(start_time)
                       + '&endTime=' + dt_to_str(end_time))
        return STATUS_SUCCESS, '', ret
    except Exception as e:
        traceback.print_exc(file=sys.stdout)
        return STATUS_FAIL, str(e), None

def do_verify(self, subscription, parameters):
    # ------TO BE REPLACED: other applications replace the part below-------
    # For forecast, check that the target and all factors share the same
    # granularity, and that each factor matches exactly one series.
    meta = self.tsanaclient.get_metric_meta(
        parameters['apiKey'],
        parameters['instance']['params']['target']['metricId'])
    if meta is None:
        return STATUS_FAIL, 'Target is not found.'
    target_gran = meta['granularityName']
    # Only for 'Custom' granularity is the amount meaningful; it is a number of seconds
    target_gran_amount = meta['granularityAmount']
    for data in parameters['seriesSets']:
        if target_gran != data['metricMeta']['granularityName'] \
                or (target_gran == 'Custom'
                    and target_gran_amount != data['metricMeta']['granularityAmount']):
            return STATUS_FAIL, 'Granularity must be identical between target and factors.'

    # Check the series count; each factor should match exactly one series
    seriesCount = 0
    for data in parameters['seriesSets']:
        dim = {}
        for dimkey in data['dimensionFilter']:
            dim[dimkey] = [data['dimensionFilter'][dimkey]]
        dt = dt_to_str(str_to_dt(meta['dataStartFrom']))
        para = dict(metricId=data['metricId'], dimensions=dim, count=2, startTime=dt)
        # Suppose 100 is your limit; the actual value comes from self.config.series_limit
        ret = self.tsanaclient.post(parameters['apiKey'],
                                    '/metrics/' + data['metricId'] + '/rank-series',
                                    data=para)
        if ret is None or 'value' not in ret:
            return STATUS_FAIL, 'Read series rank failed.'
        seriesCount += len(ret['value'])
        if len(ret['value']) != 1 or seriesCount > self.config.series_limit:
            return STATUS_FAIL, ('Cannot accept ambiguous factors or too many series '
                                 'in the group, limit is '
                                 + str(self.config.series_limit) + '.')
    return STATUS_SUCCESS, ''

def do_verify(self, subscription, parameters):
    # Check the series count; each factor should match exactly one series
    seriesCount = 0
    for data in parameters['seriesSets']:
        dim = {}
        for dimkey in data['dimensionFilter']:
            dim[dimkey] = [data['dimensionFilter'][dimkey]]
        meta = self.tsanaclient.get_metric_meta(parameters['apiKey'], data['metricId'])
        if meta is None:
            return STATUS_FAIL, 'Metric {} is not found.'.format(data['metricId'])
        dt = dt_to_str(str_to_dt(meta['dataStartFrom']))
        para = dict(metricId=data['metricId'], dimensions=dim, count=2, startTime=dt)
        # Suppose 100 is your limit; the actual value comes from self.config.series_limit
        ret = self.tsanaclient.post(parameters['apiKey'],
                                    '/metrics/' + data['metricId'] + '/rank-series',
                                    data=para)
        if ret is None or 'value' not in ret:
            return STATUS_FAIL, 'Read series rank failed.'
        if len(ret['value']) == 0:
            return STATUS_FAIL, "Data not found for {}".format(para)
        seriesCount += len(ret['value'])
        if len(ret['value']) != 1 or seriesCount > self.config.series_limit:
            return STATUS_FAIL, ('Cannot accept ambiguous factors or too many series '
                                 'in the group, limit is '
                                 + str(self.config.series_limit) + '.')
    return STATUS_SUCCESS, ''

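# For reference, a hypothetical `parameters` payload of the shape that the
# entry points in this section read from; every ID and value below is
# illustrative, not from a real TSANA group:
sample_parameters = {
    'apiKey': '<api-key>',
    'groupId': '<time-series-group-id>',
    'startTime': '2020-01-01T00:00:00Z',
    'endTime': '2020-01-02T00:00:00Z',
    'seriesSets': [{
        'seriesSetId': '<series-set-id>',
        'metricId': '<metric-id>',
        'dimensionFilter': {'region': 'eu'},
        'metricMeta': {'granularityName': 'Hourly', 'granularityAmount': 0},
    }],
    'instance': {
        'instanceId': '<app-instance-id>',
        'params': {'tracebackWindow': 28, 'fillUpMode': 'Previous'},
    },
}
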
def save_inference_result(self, parameters, result):
    try:
        if len(result) <= 0:
            return STATUS_SUCCESS, ''
        body = {
            'groupId': parameters['groupId'],
            'instanceId': parameters['instance']['instanceId'],
            'results': []
        }
        for item in result:
            item['timestamp'] = dt_to_str(str_to_dt(item['timestamp']))
            body['results'].append({
                'params': parameters['instance']['params'],
                'timestamp': item['timestamp'],
                'result': item,
                'status': InferenceState.Ready.name
            })
        self.post(parameters['apiKey'],
                  '/timeSeriesGroups/' + parameters['groupId']
                  + '/appInstances/' + parameters['instance']['instanceId']
                  + '/saveResult',
                  body)
        return STATUS_SUCCESS, ''
    except Exception as e:
        traceback.print_exc(file=sys.stdout)
        return STATUS_FAIL, str(e)

def prepare_inference_data(self, parameters):
    end_time = str_to_dt(parameters['endTime'])
    if 'startTime' in parameters:
        start_time = str_to_dt(parameters['startTime'])
    else:
        start_time = end_time
    factor_def = parameters['seriesSets']
    factors_data = self.tsanaclient.get_timeseries(parameters['apiKey'], factor_def,
                                                   start_time, end_time)
    variable = {}
    for factor in factors_data:
        variable[factor.series_id] = factor.value
    result = {}
    result['data'] = variable
    result['startTime'] = dt_to_str(start_time)
    result['endTime'] = dt_to_str(end_time)
    return result

def do_inference(self, model_dir, parameters, context):
    results = []
    factors_data = self.prepare_inference_data(parameters)
    start_time, end_time, gran = self.get_inference_time_range(parameters)
    traceback_window = parameters['instance']['params']['tracebackWindow']
    for timestamp in get_time_list(start_time, end_time, gran):
        single_point = []
        for factor in factors_data:
            # Fit a linear regression over the last `traceback_window` points
            # observed before this timestamp, then extrapolate one step ahead
            x = np.array([point['timestamp'].timestamp()
                          for point in factor.value
                          if point['timestamp'] < timestamp])[-traceback_window:].reshape(-1, 1)
            y = np.array([point['value']
                          for point in factor.value
                          if point['timestamp'] < timestamp])[-traceback_window:]
            model = linear_model.LinearRegression().fit(x, y)
            y_new = model.predict(x)
            single_point.append(
                dict(seriesId=factor.series_id,
                     value=model.predict(
                         np.array([timestamp.timestamp()]).reshape(-1, 1))[0],
                     mse=mean_squared_error(y, y_new),
                     r2score=r2_score(y, y_new)))
        results.append(
            dict(timestamp=dt_to_str(timestamp),
                 status=InferenceState.Ready.name,
                 result=single_point))
    status, message = self.tsanaclient.save_inference_result(parameters, results)
    if status != STATUS_SUCCESS:
        raise Exception(message)
    return STATUS_SUCCESS, ''

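# A standalone, runnable sketch of the per-point technique used in
# do_inference above: fit a LinearRegression on the last `traceback_window`
# observations before a timestamp and extrapolate one step ahead. The data
# and function name here are synthetic, for illustration only.
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

def forecast_next(timestamps, values, next_ts, traceback_window=12):
    # Keep only history strictly before the timestamp being predicted
    x = np.array([t for t in timestamps if t < next_ts])[-traceback_window:].reshape(-1, 1)
    y = np.array([v for t, v in zip(timestamps, values) if t < next_ts])[-traceback_window:]
    model = linear_model.LinearRegression().fit(x, y)
    y_fit = model.predict(x)
    return (model.predict(np.array([[next_ts]]))[0],
            mean_squared_error(y, y_fit),
            r2_score(y, y_fit))

ts = list(range(100))               # epoch-like seconds, synthetic
vals = [2.0 * t + 3.0 for t in ts]  # a perfectly linear series
pred, mse, r2 = forecast_next(ts, vals, next_ts=100)
print(pred, mse, r2)                # ~203.0, ~0.0, 1.0
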
def do_inference(self, subscription, model_id, model_dir, parameters):
    log.info("Start to inference %s", model_dir)
    inference_window = parameters['instance']['params']['windowSize']
    meta = self.tsanaclient.get_metric_meta(
        parameters['apiKey'],
        parameters['instance']['params']['target']['metricId'])
    if meta is None:
        return STATUS_FAIL, 'Metric is not found.'
    end_time = str_to_dt(parameters['endTime'])
    if 'startTime' in parameters:
        start_time = str_to_dt(parameters['startTime'])
    else:
        start_time = end_time
    cur_time = start_time

    # Pad the query range: one extra point after the end, and twice the
    # inference window before the start, so every timestamp has enough history
    data_end_time = get_time_offset(end_time,
                                    (meta['granularityName'], meta['granularityAmount']),
                                    +1)
    data_start_time = get_time_offset(start_time,
                                      (meta['granularityName'], meta['granularityAmount']),
                                      -inference_window * 2)
    factor_def = parameters['seriesSets']
    factors_data = self.tsanaclient.get_timeseries(parameters['apiKey'], factor_def,
                                                   data_start_time, data_end_time)
    target_def = [parameters['instance']['params']['target']]
    target_data = self.tsanaclient.get_timeseries(parameters['apiKey'], target_def,
                                                  data_start_time, data_end_time)

    params = parameters['instance']['params']
    model, window = load_inference_model(
        model_dir=model_dir,
        target_size=params['step'],
        window=inference_window,
        metric_sender=MetricSender(self.config, subscription, model_id),
        epoc=params.get('epoc', self.config.lstm['epoc']),
        validation_freq=params.get('validation_freq', self.config.lstm['validation_freq']),
        validation_ratio=params.get('validation_ratio', self.config.lstm['validation_ratio']))
    input_data = load_inference_input_data(
        target_series=target_data[0],
        factor_series=factors_data,
        model=model,
        gran=Gran[meta['granularityName']],
        custom_in_seconds=meta['granularityAmount'],
        fill_type=Fill[params['fill']] if 'fill' in params else Fill.Previous,
        fill_value=params.get('fillValue', 0))

    while cur_time <= end_time:
        try:
            result = inference(input_data=input_data,
                               window=window,
                               timestamp=cur_time,
                               target_size=params['step'],
                               model=model)
            if len(result) > 0:
                # Shift the result timestamps back by the configured target offset
                if 'target_offset' in params:
                    offset = int(params['target_offset'])
                    for idx in range(len(result)):
                        result[idx]['timestamp'] = dt_to_str(
                            get_time_offset(cur_time,
                                            (meta['granularityName'],
                                             meta['granularityAmount']),
                                            -offset + idx))
                self.tsanaclient.save_inference_result(parameters, result)
            else:
                log.error("No result for this inference %s, key %s"
                          % (dt_to_str(cur_time), model_dir))
        except Exception:
            log.exception("-------Inference exception-------")
        cur_time = get_time_offset(cur_time,
                                   (meta['granularityName'], meta['granularityAmount']),
                                   +1)
    return STATUS_SUCCESS, ''

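# get_time_offset is used throughout this section but not defined in it.
# A minimal sketch, assuming it steps a datetime by `offset` granularity
# units, with 'Custom' granularity measured in seconds (per the comment in
# do_verify above); monthly/yearly granularities are omitted since they are
# not fixed-length:
from datetime import timedelta

def get_time_offset(base_time, gran, offset):
    granularity_name, granularity_amount = gran
    seconds_per_unit = {'Minutely': 60, 'Hourly': 3600,
                        'Daily': 86400, 'Weekly': 604800}
    if granularity_name == 'Custom':
        step = granularity_amount  # number of seconds for custom granularity
    else:
        step = seconds_per_unit[granularity_name]
    return base_time + timedelta(seconds=step * offset)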