def is_complete(self, interval, trendstore, filter=None, ratio=1):
    """
    Return False when trend data is considered incomplete for a specific
    interval.

    Trend data is considered complete when two row counts are done: a row
    count for the interval (start, end) and a row count for the same
    interval a week earlier. Both row counts must be non-zero and their
    ratio must be at least the specified `ratio`.
    """
    complete = False

    row_count = partial(self.count, trendstore, filter=filter)

    count = row_count(interval)
    ref_count = row_count(
        [get_previous_timestamp(ts, 7 * 86400) for ts in interval])

    try:
        if count / ref_count >= ratio:
            complete = True
    except (ZeroDivisionError, TypeError):
        # A zero or missing reference count means completeness cannot be
        # established, so the interval is reported as incomplete.
        pass

    return complete
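# The completeness rule above is easy to exercise in isolation. This is an
# illustrative, self-contained sketch of the same decision, not the class
# method itself; the counts below are made up.
def completeness_demo(count, ref_count, ratio=1):
    try:
        return count / float(ref_count) >= ratio
    except (ZeroDivisionError, TypeError):
        # Zero or unknown reference count: report incomplete.
        return False


assert completeness_demo(1000, 1000) is True          # identical counts
assert completeness_demo(950, 1000, ratio=0.9) is True
assert completeness_demo(500, 1000) is False          # below the ratio
assert completeness_demo(1000, 0) is False            # empty reference week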
# assert_equal is assumed to come from nose.tools here; get_previous_timestamp
# and get_most_recent_timestamp are imported from the module under test.
from datetime import datetime

import pytz
from nose.tools import assert_equal


def test_previous_timestamp():
    """Test previous timestamp retrieval."""
    tz = pytz.timezone("Europe/Amsterdam")

    granularity = 3600
    ts = tz.localize(datetime(2013, 4, 2, 10, 0, 0))
    previous_timestamp = tz.localize(datetime(2013, 4, 2, 9, 0, 0))

    assert_equal(get_previous_timestamp(ts, granularity), previous_timestamp)

    granularity = 86400
    ts = get_most_recent_timestamp(
        tz.localize(datetime(2013, 4, 2, 10, 13, 0)), granularity)
    previous_timestamp = tz.localize(datetime(2013, 4, 1, 0, 0, 0))

    assert_equal(get_previous_timestamp(ts, granularity), previous_timestamp)
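# The semantics that the test pins down can be mimicked with plain timedelta
# arithmetic. This is an illustrative sketch, not the library's actual
# implementation: it assumes get_previous_timestamp() means "granularity
# seconds earlier", with pytz normalization to keep DST offsets correct.
from datetime import datetime, timedelta

import pytz


def previous_timestamp_sketch(ts, granularity):
    # Subtract the granularity and renormalize, so a result that crosses a
    # DST transition ends up with the right UTC offset.
    return ts.tzinfo.normalize(ts - timedelta(seconds=granularity))


tz = pytz.timezone("Europe/Amsterdam")
ts = tz.localize(datetime(2013, 4, 2, 10, 0, 0))
print(previous_timestamp_sketch(ts, 3600))  # 2013-04-02 09:00:00+02:00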
def is_complete(self, interval, datasource, gp, entitytype_name,
                filter=None, ratio=1):
    """
    Return False when trend data is considered incomplete for a specific
    interval.

    Trend data is considered complete when two row counts are done: a row
    count for the interval (start, end) and a row count for the same
    interval a week earlier. Both row counts must be non-zero and their
    ratio must be at least the specified `ratio`.

    If the reference row count is zero, the check is repeated against the
    interval one day earlier instead of a week earlier.
    """
    def _ratio(n, d):
        try:
            return n / d
        except (ZeroDivisionError, TypeError):
            return None

    def _is_complete(count, ref_count):
        # Guard against a None ratio; comparing None with a number would
        # raise a TypeError here.
        measure = _ratio(count, ref_count)
        return measure is not None and measure >= ratio

    row_count = partial(
        self.count, datasource, gp, entitytype_name, filter=filter)

    count = row_count(interval)
    ref_count = row_count(
        [get_previous_timestamp(ts, 7 * 86400) for ts in interval])

    complete = _is_complete(count, ref_count)

    # Plan B: when the week-earlier reference is empty, compare with the
    # day-earlier interval instead.
    if ref_count == 0:
        ref_count = row_count(
            [get_previous_timestamp(ts, 1 * 86400) for ts in interval])
        complete = _is_complete(count, ref_count)

    return complete
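# The week-then-day fallback is easier to follow in isolation. A minimal
# sketch with a hypothetical row_count callable (interval -> number of
# rows); the real method delegates counting to self.count instead.
from datetime import timedelta


def completeness_sketch(row_count, interval, ratio=1):
    def shifted(days):
        return [ts - timedelta(days=days) for ts in interval]

    def check(ref_interval):
        count = row_count(interval)
        ref_count = row_count(ref_interval)
        complete = ref_count != 0 and float(count) / ref_count >= ratio
        return complete, ref_count

    complete, ref_count = check(shifted(7))

    # Plan B: an empty week-earlier reference falls back to the day before.
    if ref_count == 0:
        complete, _ = check(shifted(1))

    return complete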
from contextlib import closing

import psycopg2
import psycopg2.errorcodes

# Project helpers such as get_previous_timestamp, get_table_names,
# make_table_name, column_exists, get_data_types, check_column_types,
# add_missing_columns, create_trend_table, datatype, MAX_RETRIES,
# MaxRetriesError and NonRecoverableError are assumed to be imported
# elsewhere in this module.


def aggregate(conn, schema, source, target, trend_names, timestamp):
    """
    Basic aggregation of trend data.

    :param conn: psycopg2 database connection
    :param schema: schema where source and target data is located
    :param source: tuple (datasource, gp, entitytype_name) specifying source
    :param target: tuple (datasource, gp, entitytype_name) specifying target
    :param trend_names: trends to aggregate
    :param timestamp: non-naive timestamp specifying end of interval to
        aggregate
    """
    target_gp = target[1]
    interval = (get_previous_timestamp(timestamp, target_gp), timestamp)

    (ds, gp, et_name) = source
    source_table_names = get_table_names(
        [ds], gp, et_name, interval[0], interval[1])
    target_table_name = make_table_name(*(target + (timestamp,)))

    if column_exists(conn, schema, source_table_names[-1], "samples"):
        select_samples_part = "SUM(samples)"
        select_samples_column = "samples,"
    else:
        select_samples_part = "COUNT(*)"
        select_samples_column = ""

    select_parts = []

    for source_table_name in source_table_names:
        select_parts.append(
            "SELECT "
            "entity_id, '{1}', {2} {3} "
            "FROM \"{0}\".\"{4}\" "
            "WHERE timestamp > %s AND timestamp <= %s ".format(
                schema,
                timestamp.strftime("%Y-%m-%d %H:%M:%S"),
                select_samples_column,
                ",".join(["\"{0}\"".format(tn) for tn in trend_names]),
                source_table_name))

    query = (
        "INSERT INTO \"{0}\".\"{1}\" (entity_id, timestamp, samples, {2}) "
        "SELECT entity_id, '{4}', {5}, {6} FROM "
        "( {3} ) AS sources "
        "GROUP BY entity_id".format(
            schema,
            target_table_name,
            ",".join(["\"{0}\"".format(tn) for tn in trend_names]),
            " UNION ALL ".join(select_parts),
            timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            select_samples_part,
            ",".join(["SUM(\"{0}\")".format(tn) for tn in trend_names])))

    retry = True
    attempt = 0

    # Strategy followed in the code below is like the trend_storage.store()
    # function: run the query, and on specific recoverable database errors
    # adjust the target table and try again.
    while retry is True:
        retry = False
        attempt += 1

        if attempt > MAX_RETRIES:
            raise MaxRetriesError(
                "Max retries ({0}) reached".format(MAX_RETRIES))

        try:
            with closing(conn.cursor()) as cursor:
                # Each UNION ALL part takes the same (start, end) pair.
                cursor.execute(query, len(source_table_names) * interval)
        except psycopg2.DatabaseError as exc:
            conn.rollback()

            columns = [("samples", "integer")]
            columns.extend(zip(trend_names, get_data_types(
                conn, schema, source_table_names[-1], trend_names)))

            if exc.pgcode == psycopg2.errorcodes.NUMERIC_VALUE_OUT_OF_RANGE:
                # An aggregated value overflowed a target column type:
                # derive wide enough types from the actual maxima and retry.
                max_values = []

                for source_table_name in source_table_names:
                    query_max_values = (
                        "SELECT {0} FROM "
                        "(SELECT "
                        " {1} "
                        "FROM \"{2}\".\"{3}\" "
                        "WHERE timestamp > %s AND timestamp <= %s "
                        "GROUP BY entity_id) AS sums").format(
                            ",".join(["MAX(\"{0}\")".format(tn)
                                      for tn in trend_names]),
                            ",".join(["SUM(\"{0}\") AS \"{0}\"".format(tn)
                                      for tn in trend_names]),
                            schema,
                            source_table_name)

                    with closing(conn.cursor()) as cursor:
                        cursor.execute(query_max_values, interval)
                        max_values.append(cursor.fetchone())

                data_types = [
                    datatype.extract_from_value(v)
                    for v in map(max, zip(*max_values))]

                check_column_types(
                    conn, schema, target_table_name, trend_names, data_types)

                retry = True
            elif exc.pgcode == psycopg2.errorcodes.UNIQUE_VIOLATION:
                raise NonRecoverableError(
                    "{0}, {1!s} in query '{2}'".format(
                        exc.pgcode, exc, query))
                # TODO: remove unique violating record from target
                # retry = True
            elif exc.pgcode == psycopg2.errorcodes.UNDEFINED_COLUMN:
                column_names, data_types = zip(*columns)
                add_missing_columns(
                    conn, schema, target_table_name,
                    zip(column_names, data_types))
                retry = True
            elif exc.pgcode == psycopg2.errorcodes.UNDEFINED_TABLE:
                column_names, data_types = zip(*columns)
                create_trend_table(
                    conn, schema, target_table_name, column_names,
                    data_types)
                retry = True
            else:
                raise NonRecoverableError(
                    "{0}, {1!s} in query '{2}'".format(
                        exc.pgcode, exc, query))
        else:
            conn.commit()
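# A call site might look like this. The DSN, schema name, the datasource/
# granularity/entitytype tuples and the trend column names below are all
# hypothetical, chosen only to show the argument shapes.
from datetime import datetime

import psycopg2
import pytz

conn = psycopg2.connect("dbname=minerva user=minerva")

tz = pytz.timezone("Europe/Amsterdam")
timestamp = tz.localize(datetime(2013, 4, 2, 12, 0, 0))

# Roll 900-second source data up into one-hour target rows ending at
# `timestamp`.
aggregate(
    conn,
    "trend",                   # schema holding source and target tables
    ("pm", 900, "cell"),       # source: (datasource, gp, entitytype_name)
    ("pm", 3600, "cell"),      # target: (datasource, gp, entitytype_name)
    ["rx_power", "tx_power"],  # trends to aggregate
    timestamp)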