import pandas as pd
from influxdb import DataFrameClient


def read_influxdb_data(host='192.168.123.245', port=8086,
                       dbname='c9377a95-82f3-4af3-ac14-40d14f6d2abe',
                       ChannelName='1Y520210100', time_start='', time_end='',
                       user='******', password='******', keyword=''):
    """Read every series of a channel from InfluxDB into one DataFrame.

    Returns a (measurement, DataFrame) tuple, or None if nothing matched.
    """
    client = DataFrameClient(host, port, user, password, dbname)
    measurements = client.get_list_measurements()
    if keyword is None:
        keyword = ''
    if keyword == '':
        measurement = [mea.get(u'name') for mea in measurements
                       if mea.get(u'name').find(ChannelName) >= 0]
    else:
        measurement = [mea.get(u'name') for mea in measurements
                       if mea.get(u'name').find(ChannelName) >= 0
                       and mea.get(u'name').find(keyword) >= 0]
    if len(measurement) == 0:
        print('No data retrieved.')
        return None
    # Use the last (most recently listed) matching measurement.
    measurement = measurement[-1]
    # Note: an empty time_start also defaults to now(), in which case
    # `time > now()` matches no rows; pass an explicit start time.
    time_end = 'now()' if time_end == '' else "'" + time_end + "'"
    time_start = 'now()' if time_start == '' else "'" + time_start + "'"
    querystr = 'select * from "{}" where time > {} and time < {}'.format(
        measurement, time_start, time_end)
    # print(querystr)
    df = client.query(querystr).get(measurement)
    client.close()
    if df is None:
        print('InfluxDB no data retrieved.')
        return None
    # Pivot the long-format result (one row per id/val pair) into one
    # column per id.
    dff = df.groupby('id')
    columns = [name for name, group in dff]
    groups = [group['val'] for name, group in dff]
    # Check datetime alignment:
    # all([all(groups[i].index == groups[0].index)
    #      for i in range(1, len(groups))])
    result = pd.concat(groups, axis=1)
    result.columns = columns
    result.index = groups[0].index
    return measurement, result
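# Example call (a sketch: the channel name comes from the function's
# defaults, while the time range below is purely illustrative):
fetched = read_influxdb_data(ChannelName='1Y520210100',
                             time_start='2021-01-01T00:00:00Z',
                             time_end='2021-01-02T00:00:00Z')
if fetched is not None:
    measurement, result = fetched
    print(measurement, result.shape)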
def test_get_list_measurements(self):
    """Test get list of measurements for TestInfluxDBClient object."""
    cli = DataFrameClient(database='db')
    data = {
        "results": [{
            "series": [{
                "name": "measurements",
                "columns": ["name"],
                "values": [["cpu"], ["disk"]]
            }]
        }]
    }
    with _mocked_session(cli, 'get', 200, json.dumps(data)):
        self.assertListEqual(
            cli.get_list_measurements(),
            [{'name': 'cpu'}, {'name': 'disk'}]
        )
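# The `_mocked_session` helper above comes from the influxdb-python test
# suite. As an alternative, the same check can be written self-contained
# with the `requests_mock` library, which stubs the HTTP endpoint the
# client queries (a sketch; assumes the client's default localhost:8086):
import json

import requests_mock
from influxdb import DataFrameClient


def test_get_list_measurements_standalone():
    cli = DataFrameClient(database='db')
    data = {"results": [{"series": [{"name": "measurements",
                                     "columns": ["name"],
                                     "values": [["cpu"], ["disk"]]}]}]}
    with requests_mock.Mocker() as m:
        # Intercept the GET /query request issued by get_list_measurements().
        m.register_uri(requests_mock.GET,
                       "http://localhost:8086/query",
                       text=json.dumps(data))
        assert cli.get_list_measurements() == [{'name': 'cpu'},
                                               {'name': 'disk'}]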
import logging

from influxdb import DataFrameClient

logger = logging.getLogger(__name__)


class tsdb(object):
    def __init__(self, dbname, host='localhost', port=8086,
                 user='******', password='******'):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.dbname = dbname
        self.client = None
        self.protocol = 'json'

    def _connect(self):
        if self.client is None:
            self.client = DataFrameClient(host=self.host, port=self.port,
                                          username=self.user,
                                          password=self.password,
                                          database=self.dbname)
            # self.client.switch_database(self.dbname)

    def _disconnect(self):
        if self.client is not None:  # fixed typo: was `self.cleint`
            self.client.close()
            self.client = None

    def _reconnect(self):
        self._disconnect()
        self._connect()

    def create_db(self):
        self._connect()
        dbs = self.client.get_list_database()
        for e in dbs:
            if self.dbname in e.values():
                logger.debug("Database {} already exists.".format(
                    self.dbname))
                return
        logger.info("Creating database: {}".format(self.dbname))
        self.client.create_database(self.dbname)
        # self._set_retention_policy()

    def _set_retention_policy(self):
        self._connect()
        self.client.create_retention_policy(name='raw', duration='12h',
                                            replication=1, default=True)
        self.client.create_retention_policy(name='cooked', duration='52w',
                                            replication=1, default=False)

    def check_db(self):
        self._connect()
        db = self.client.get_list_database()
        ms = self.client.get_list_measurements()
        rp = self.client.get_list_retention_policies(self.dbname)
        user = self.client.get_list_users()
        print('db: {}, measurements: {}'.format(db, ms))
        print('retention policy: {}'.format(rp))
        print('users: {}'.format(user))

    def insert(self, df, measurement, tags=None):
        self._connect()
        try:
            result = self.client.write_points(df, measurement, tags=tags,
                                              time_precision='n',
                                              protocol=self.protocol)
        except Exception:  # avoid a bare except; report failure instead
            logger.info('influxdb write error')
            result = False
        return result

    def query(self, sql):
        self._connect()
        result = self.client.query(sql)
        return result
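# A minimal round trip with `tsdb` (a sketch; the database name, the demo
# DataFrame, and the measurement name are made up for illustration):
import pandas as pd

db = tsdb('sensor_data')
db.create_db()
demo = pd.DataFrame({'val': [1.0, 2.0]},
                    index=pd.date_range('2021-01-01', periods=2, freq='s'))
db.insert(demo, 'demo_measurement', tags={'source': 'example'})
print(db.query('select * from "demo_measurement"'))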
import pandas as pd
from influxdb import DataFrameClient

client = DataFrameClient(host='172.31.3.1', port=8086,
                         database='aiops_logwarn')  # connect to the database
# Get the names of all measurements (tables).
tables = list(map(lambda x: x['name'], client.get_list_measurements()))
count = 0
for table in tables:
    sql = "select * from \"%s\"" % table
    result = client.query(sql)[table]
    columns = result.columns
    # Keep only the columns whose names contain no '-'.
    df = result[columns[list(
        map(lambda x: False if '-' in x else True, columns))]]
    for column in df.columns:
        df2 = df[column].reset_index()
        # Convert the DatetimeIndex to epoch milliseconds.
        df2['index'] = df2['index'].map(lambda x: x.timestamp() * 1000)
        df2.dropna().to_csv(
            "/Users/chenxilin/Code/Python/Python_Notes/数据库/influxdb/csv/%s.csv" % count,
            header=False, index=False)
        print(count)
        count += 1
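# Reading one of the exported files back (a sketch: the CSVs above are
# written without a header, so the column names here are assumptions, and
# the path is a placeholder):
df_back = pd.read_csv('csv/0.csv', header=None,
                      names=['timestamp_ms', 'value'])
df_back['timestamp_ms'] = pd.to_datetime(df_back['timestamp_ms'], unit='ms')
print(df_back.head())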
# Excerpt from a larger module: `load_config`, `AttrDict`,
# `dtype_string_to_dtype_object`, `serialize_dict_1d`, `deserialize_dict_1d`,
# `dicts_to_listed_dict_2d`, and `_extra_supports` are project-internal
# helpers assumed to be defined elsewhere in the package.
import hashlib
import logging
from copy import deepcopy
from datetime import datetime

import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import sql
from sqlalchemy.orm import scoped_session, sessionmaker
from tqdm import tqdm


class BaseDBHandler(object):
    """Handler for db."""

    __version__ = 'v2'
    db_defaults = load_config(__version__).sql.base
    _config = AttrDict()
    _df_name = 'base_df'
    _columns = None

    @classmethod
    def default_config(cls):
        """Return default configurations.

        Returns:
            (AttrDict): default configurations

        """
        return load_config(cls.__version__)

    def __init__(self,
                 db_engine=None,
                 db_host=None,
                 db_name=None,
                 db_username=None,
                 db_password=None,
                 df_name=None,
                 read_on_init=True):
        """Initialize BaseDBHandler.

        Args:
            db_engine (str): database engine (if None, the one in the config
                file will be used)
            db_host (str): database host (if None, the one in the config
                file will be used)
            db_name (str): database name (if None, the one in the config
                file will be used)
            db_username (str): username (if None, the one in the config
                file will be used)
            db_password (str): password (if None, the one in the config
                file will be used)
            df_name (str): dataframe (table in DB) name (if None, the class
                default value will be used)
            read_on_init (bool): if True, the dataframe will be read from
                the database on initialization

        """
        self._cursor = 0
        self.logger = logging.getLogger(__name__)
        if df_name is not None:
            self.df_name = df_name

        # Load config
        self._config = load_config(self.__version__)

        # Initialize database
        self._initialize_engine(db_engine, db_host, db_name, db_username,
                                db_password)

        # Initialize dataframe
        self.df = self._initialize_df()

        # Fetch table
        if read_on_init:
            self.read()

    def __iter__(self):
        """Return iterator."""
        return self

    def __next__(self):
        """Return the next item."""
        if self._cursor >= len(self.df):
            self._cursor = 0
            raise StopIteration()

        # Grab data
        data = self.df.take([self._cursor]).to_dict(orient='records')[0]

        # Delete internal column
        if 'uuid_in_df' in data.keys():
            del data['uuid_in_df']

        # Post-processes
        data = deserialize_dict_1d(data)

        # Increment
        self._cursor += 1

        return data

    def __len__(self):
        """Return number of records."""
        return len(self.df)

    def _initialize_engine(self, db_engine=None, db_host=None, db_name=None,
                           db_username=None, db_password=None):
        """Initialize DB engine."""
        # Parse
        engine = db_engine if db_engine is not None \
            else self.db_defaults.engine
        username = db_username if db_username is not None \
            else self.db_defaults.username
        password = db_password if db_password is not None \
            else self.db_defaults.password
        host = db_host if db_host is not None else self.db_defaults.host
        database = db_name if db_name is not None \
            else self.db_defaults.database

        # Substitute
        self._config.current_db = {
            'engine': engine,
            'host': host,
            'username': username,
            'password': password,
            'database': database,
        }

        # Connect
        if engine in ['influxdb']:
            if not _extra_supports['influxdb']:
                raise ImportError('Module `influxdb` could not be imported')
            hostname = host
            hostport = 8086
            if ':' in host:
                hostname, hostport = host.split(':')
            self._engine = DataFrameClient(hostname, hostport, username,
                                           password, database)
        else:
            engine = 'postgresql' if engine == 'timescaledb' \
                else 'mysql' if engine == 'mariadb' \
                else engine
            username_and_password = '' \
                if all([username == '', password == '']) \
                else '{0}:{1}@'.format(username, password)
            self._engine = sqlalchemy.create_engine('{0}://{1}{2}{3}'.format(
                engine,
                username_and_password if engine != 'sqlite' else '',
                '/{0}'.format(host) if engine == 'sqlite' else host,
                '' if engine == 'sqlite' else '/' + database),
                echo=False)
            self._session = scoped_session(sessionmaker(bind=self._engine))

    def _initialize_df(self):
        """Initialize DF."""
        df = pd.concat(
            [pd.Series(name=c['name'],
                       dtype=dtype_string_to_dtype_object(c['dtype']))
             for c in self.columns]
            + [pd.Series(name='uuid_in_df', dtype=str)],
            axis=1)
        return df

    def _get_column_names(self):
        """Return column names."""
        return [c['name'] for c in self.columns]

    def _get_sql_columns(self):
        return [sql.column(c) for c in self._get_column_names()]

    def _get_uuid_from_item(self, data_in):
        """Return UUID of the given item.

        Args:
            data_in (dict or pandas.Series): dict or Series containing data

        Returns:
            (str): UUID

        """
        item = data_in
        if isinstance(item, pd.Series):
            item = data_in.to_dict()

        pre_hash = ''.join([
            '{:.09f}'.format(item[c['name']])
            if isinstance(item[c['name']], float) else str(item[c['name']])
            for c in self.columns if c['name'] in item.keys()
        ])
        pre_hash = pre_hash.encode('utf-8')
        uuid = hashlib.md5(pre_hash).hexdigest()

        return uuid

    def _preprocess_list_of_dicts(self, data_in):
        """Preprocess list of dicts.

        Args:
            data_in (list): list of dicts containing data

        Returns:
            (dict): dict of lists = listed dict

        """
        data = deepcopy(data_in)

        # Serialize (convert list to str)
        self.logger.info('(Preprocess) Serializing...')
        for item in tqdm(data, desc='Serialization', leave=False):
            item = serialize_dict_1d(item)

            # Add df_uuid
            item['uuid_in_df'] = self._get_uuid_from_item(item)

            # Add missing columns
            for column in self.columns:
                column_name, column_dtype = column['name'], column['dtype']
                if column_name not in item.keys():
                    item.update({column_name: None})
                else:
                    dtype_obj = dtype_string_to_dtype_object(column_dtype)
                    if dtype_obj is None:
                        continue
                    if item[column_name] is not None \
                            and not isinstance(item[column_name], dtype_obj):
                        try:
                            item[column_name] = dtype_obj(item[column_name])
                        except ValueError:
                            item[column_name] = np.nan

        # Convert dict to listed dict
        self.logger.info('(Preprocess) Converting...')
        data = dicts_to_listed_dict_2d(data)

        return data

    def _append_listed_dict_to_df(self, data, check_unique=False):
        """Append pre-processed dict to self._df.

        Args:
            data (dict): data to add
            check_unique (bool): if True, it will be checked that the data
                is unique in the db

        """
        self.df = pd.concat([self.df, pd.DataFrame.from_dict(data)],
                            sort=False)
        if check_unique:
            self.df.drop_duplicates('uuid_in_df', inplace=True)

    def add_data(self, data_in, **kwargs):
        """Add data to db.

        Args:
            data_in (dict): a dict containing data

        """
        self.add_list_of_data([data_in], **kwargs)

    def add_list_of_data(self, data_in, **kwargs):
        """Add list of data to db.

        Args:
            data_in (list): a list of dicts containing data

        """
        data = self._preprocess_list_of_dicts(data_in)
        self.logger.info('Adding data to DB...')
        self._append_listed_dict_to_df(data, **kwargs)
        self.logger.info('Successfully finished adding data to DB')

    def read(self, df_name=None, query=None, where=None, order_by=None,
             **kwargs):
        """Read data from SQL.

        Args:
            df_name (str): dataframe name to read
            query (str SQL query or SQLAlchemy Selectable): query to select
                items
            where (str): query string for filtering items
            order_by (str): column name to sort by
            **kwargs: kwargs for function `pandas.read_sql_query`
                or `influxdb.DataFrameClient.query`

        """
        if df_name is not None:
            self.df_name = df_name

        # Create a query
        q = query
        if q is None:
            # Check if the table exists on DB
            if self._config.current_db['engine'] in ['influxdb']:
                if self.df_name not in [
                        entry['name']
                        for entry in self._engine.get_list_measurements()
                ]:
                    self.df = self._initialize_df()
                    return
            else:
                if not self._engine.dialect.has_table(self._engine,
                                                      self.df_name):
                    self.df = self._initialize_df()
                    return

            # Create a sub-query for extracting unique records
            sub_q = sql.select('*', from_obj=sql.table(self.df_name))
            if self._config.current_db['engine'] in ['mysql', 'sqlite']:
                sub_q = sub_q.group_by(sql.column('uuid_in_df'))
            elif self._config.current_db['engine'] in [
                    'postgresql', 'timescaledb'
            ]:
                sub_q = sub_q.distinct(sql.column('uuid_in_df'))

            # Create a query
            if self._config.current_db['engine'] in ['influxdb']:
                q = sub_q
            else:
                q = sql.select('*', from_obj=sub_q.alias('temp'))
            if where is not None:
                q = q.where(sql.text(where))
            if order_by is not None:
                q = q.order_by(sql.text(order_by))

        # Read table from DB
        try:
            if self._config.current_db['engine'] in ['influxdb']:
                df = list(self._engine.query(str(q), **kwargs).values())[0]
            else:
                df = pd.read_sql_query(q, self._engine, **kwargs)
        except Exception as e:
            self.logger.warning(
                'Could not execute SQL statement: "{0}" (reason: {1})'.format(
                    str(q), str(e)))
            df = self._initialize_df()

        self.df = df

    def save(self, df=None, remove_duplicates=False, **kwargs):
        """Save data to SQL.

        Args:
            df (pandas.DataFrame): DataFrame to save (if None, self.df will
                be saved)
            remove_duplicates (bool): if True, duplicated rows will be
                removed
            **kwargs: kwargs for function `pandas.dataframe.to_sql`
                or `influxdb.DataFrameClient.write_points`

        """
        dataframe = df if df is not None else self.df
        dataframe.drop_duplicates('uuid_in_df', inplace=True)

        if self._config.current_db['engine'] in ['influxdb']:
            self._engine.write_points(dataframe, self.df_name, **kwargs)
        else:
            if 'index' not in kwargs.keys():
                kwargs.update({'index': False})
            dataframe.to_sql(self.df_name,
                             self._engine,
                             if_exists='append',
                             **kwargs)

        if remove_duplicates:
            if self._config.current_db['engine'] in ['influxdb']:
                logging.warning(
                    'Option "remove duplicates" is not supported yet.')
                return

            # Create temporal table
            temp_table_name = self.df_name + '_' + datetime.now().strftime(
                '%s')
            self._initialize_df().to_sql(temp_table_name,
                                         self._engine,
                                         index=False)

            # Select unique rows and insert into the temporal table
            if self._config.current_db['engine'] in ['mysql', 'sqlite']:
                select = 'select * from "{0}" group by uuid_in_df'.format(
                    self.df_name)
            elif self._config.current_db['engine'] in [
                    'postgresql', 'timescaledb'
            ]:
                select = 'select distinct * from "{0}"'.format(self.df_name)
            else:
                raise ValueError('Unsupported engine: {}'.format(
                    self._engine.name))
            q = 'insert into "{0}" {1}'.format(temp_table_name, select)
            self._engine.execute(q)

            # Drop deprecated table if exists
            if self._engine.dialect.has_table(self._engine,
                                              self.df_name + '_deprecated'):
                self._engine.execute('drop table "{0}"'.format(
                    self.df_name + '_deprecated'))

            # Deprecate the original table
            self._engine.execute('alter table "{0}" rename to "{1}"'.format(
                self.df_name, self.df_name + '_deprecated'))

            # Rename the temporal table
            self._engine.execute('alter table "{0}" rename to "{1}"'.format(
                temp_table_name, self.df_name))

            # Drop the deprecated table
            self._engine.execute('drop table "{0}"'.format(
                self.df_name + '_deprecated'))

    @property
    def df(self):
        """Return df."""
        return self._df

    @df.setter
    def df(self, value):
        """Setter for self.df."""
        if not isinstance(value, pd.DataFrame):
            raise ValueError('Only pandas dataframe is accepted.')

        # Set columns based on the given DF
        if len(self.columns) == 0 and len(value) > 0:
            self.columns = [{
                'name': c,
                'dtype': 'none'
            } for c in value.columns.to_list()]

        # Add column 'uuid_in_df'
        if 'uuid_in_df' not in value.columns.to_list():
            value['uuid_in_df'] = value.apply(
                lambda x: self._get_uuid_from_item(x), axis=1)
            self.columns += [{'name': 'uuid_in_df', 'dtype': 'str'}]

        self._df = value

    @property
    def df_name(self):
        """Return df_name."""
        return self._df_name

    @df_name.setter
    def df_name(self, value):
        """Setter for self.df_name."""
        self._df_name = value

    @property
    def columns(self):
        """Return columns of DF."""
        if self._columns is not None:
            return self._columns
        try:
            return self._config[self.df_name]['columns']
        except KeyError:
            return []

    @columns.setter
    def columns(self, value):
        """Set self.columns."""
        if not isinstance(value, list):
            raise ValueError(
                'Columns must be a list of dicts with keys "name" and '
                '"dtype".')
        if len(value) < 1:
            raise ValueError('At least one item must be in the list.')
        if not isinstance(value[0], dict) or "name" not in value[0].keys() \
                or "dtype" not in value[0].keys():
            raise ValueError(
                'Columns must be a list of dicts with keys "name" and '
                '"dtype".')
        try:
            _ = pd.concat(
                [pd.Series(name=c['name'],
                           dtype=dtype_string_to_dtype_object(c['dtype']))
                 for c in value]
                + [pd.Series(name='uuid_in_df', dtype=str)],
                axis=1)
        except KeyError as e:
            raise ValueError('Unrecognized value: {}'.format(str(e)))
        self._columns = value
from influxdb import DataFrameClient


class InfluxDBConnector(object):

    def __init__(self, username='******', password='******', port=8086,
                 database=None, host='localhost'):
        '''
        :param username: user to connect
        :type username: str
        :param password: password of the user
        :type password: str
        :param port: port to connect to InfluxDB
        :type port: int
        :param database: database name to connect to
        :type database: str
        :param host: hostname to connect to InfluxDB
        :type host: str
        '''
        self.username = username
        self.password = password
        self.port = port
        self.database = database
        self.host = host
        self.client = DataFrameClient(self.host, self.port, self.username,
                                      self.password, self.database)

    def create_database(self, database):
        """Create a new database in InfluxDB.

        :param database: the name of the database to create
        :type database: str
        """
        self.client.create_database(database)

    def delete_database(self, database):
        """Delete a database from InfluxDB.

        :param database: the name of the database to drop
        :type database: str
        """
        self.client.drop_database(database)

    def list_databases(self):
        """Get the list of databases in InfluxDB.

        :returns: all databases in InfluxDB
        :rtype: list of dictionaries
        """
        return self.client.get_list_database()

    def list_measurements(self):
        """Get the list of measurements in the current database.

        :returns: all measurements in the database
        :rtype: list of dictionaries
        """
        return self.client.get_list_measurements()

    def write_points(self, dataframe, measurement, tags=None,
                     tag_columns=None, field_columns=None,
                     time_precision=None, database=None,
                     retention_policy=None, batch_size=None,
                     protocol='line', numeric_precision=None):
        """Write to multiple time series names.

        :param dataframe: data points in a DataFrame
        :param measurement: name of measurement
        :param tags: dictionary of tags, with string key-values
        :param tag_columns: [Optional] DataFrame columns to write as tags
        :param field_columns: [Optional] DataFrame columns to write as
            fields (defaults to all non-tag columns)
        :param time_precision: [Optional, default None] Either 's', 'ms',
            'u' or 'n'.
        :param batch_size: [Optional] Value to write the points in batches
            instead of all at one time. Useful for when doing data dumps
            from one database to another or when doing a massive write
            operation
        :type batch_size: int
        :param database: the database to write the DataFrame to
        :type database: str
        :param retention_policy: [Optional] the retention policy to write
            the points under
        :param protocol: protocol for writing data. Either 'line' or 'json'.
        :type protocol: str
        :param numeric_precision: precision for floating point values.
            Either None, 'full' or some int, where int is the desired
            decimal precision. 'full' preserves full precision for int and
            float datatypes. Defaults to None, which preserves 14-15
            significant figures for float and all significant figures for
            int datatypes.
        :returns: True, if the write operation is successful
        :rtype: bool
        """
        # Pass arguments by keyword so the call stays correct across
        # influxdb-python versions with differing parameter orders.
        return self.client.write_points(dataframe, measurement, tags=tags,
                                        tag_columns=tag_columns,
                                        field_columns=field_columns,
                                        time_precision=time_precision,
                                        database=database,
                                        retention_policy=retention_policy,
                                        batch_size=batch_size,
                                        protocol=protocol,
                                        numeric_precision=numeric_precision)

    def query(self, query, params=None, epoch=None,
              expected_response_code=200, database=None, raise_errors=True,
              chunked=False, chunk_size=0, dropna=True):
        """Send a query to InfluxDB and return the result as DataFrames.

        :param query: the actual query string
        :type query: str
        :param params: additional parameters for the request,
            defaults to {}
        :param epoch: response timestamps to be in epoch format, either
            'h', 'm', 's', 'ms', 'u', or 'ns'; defaults to `None`, which is
            RFC3339 UTC format with nanosecond precision
        :param expected_response_code: the expected status code of the
            response, defaults to 200
        :param database: database to query, defaults to None
        :type database: str
        :param raise_errors: whether or not to raise exceptions when
            InfluxDB returns errors, defaults to True
        :param chunked: enable to use chunked responses from InfluxDB.
            With ``chunked`` enabled, one result is returned per chunk
            containing all results within that chunk
        :param chunk_size: size of each chunk to tell InfluxDB to use
        :param dropna: drop columns where all values are missing
        :returns: the queried data, one DataFrame per measurement
        :rtype: dict of :class:`pandas.DataFrame`
        """
        return self.client.query(query, params=params, epoch=epoch,
                                 expected_response_code=expected_response_code,
                                 database=database,
                                 raise_errors=raise_errors,
                                 chunked=chunked, chunk_size=chunk_size,
                                 dropna=dropna)
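# Example usage of `InfluxDBConnector` (a sketch; the host defaults to
# localhost, and the database name and sample DataFrame are illustrative):
import pandas as pd

connector = InfluxDBConnector(database='metrics')
connector.create_database('metrics')
sample = pd.DataFrame({'cpu_load': [0.42, 0.58]},
                      index=pd.date_range('2021-01-01', periods=2,
                                          freq='min'))
connector.write_points(sample, 'system', tags={'host': 'web01'})
print(connector.query('select * from "system"'))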
import urllib.parse
import urllib.request
from io import BytesIO

import pandas as pd
import requests
from influxdb import DataFrameClient
from tqdm import tqdm


class DataCollector:
    def __init__(self, host, port, user, password, database):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.database = database
        try:
            self.client = DataFrameClient(host, port, user, password,
                                          database, timeout=30)
            self.client.ping()
            self.experimentIds = self.cache_experimentIds()
        except requests.exceptions.ConnectTimeout as e:
            # Timeout of InfluxDB connection
            print(e)
            self.client = None

    def get_data(self, experimentId, measurements=[], fields=[],
                 additional_clause=None, chunked=False, chunk_size=10000,
                 limit=None, offset=None, max_lag="1s"):
        if not measurements:
            results = self.client.query(
                f"SHOW measurements WHERE (ExperimentId =~ /{experimentId}/ "
                f"or ExecutionId =~ /{experimentId}/)")
            measurements = [item["name"] for item in results["measurements"]]
        measurements = ", ".join([f'"{item}"' for item in measurements])
        fields = ', '.join([f'"{item}"' for item in fields]) if fields else '*'
        limit = f' LIMIT {limit}' if limit else ''
        offset = f' OFFSET {offset}' if offset else ''
        if not additional_clause:
            additional_clause = ''
        df = self.query_df(
            f'SELECT {fields} FROM {measurements} '
            f'WHERE (ExperimentId =~ /{experimentId}/ '
            f'or ExecutionId =~ /{experimentId}/)'
            f'{additional_clause}{limit}{offset}')
        df = df.set_index('time')
        # Floor timestamps to `max_lag` so rows from different measurements
        # align, then average rows that share a timestamp.
        df.index = pd.to_datetime(df.index).floor(max_lag)
        df = df.groupby(level=0).mean()  # df.mean(level=0) in older pandas
        # .dropna(axis=0)
        return df

    def get_experimentIds_for_measurement(self, measurement):
        result = self.client.query(
            f'SELECT distinct(ExecutionId) as ExecutionId '
            f'from (SELECT * from "{measurement}")',
            chunked=False, chunk_size=1000, epoch='ns')
        return list(result[measurement].iloc[:, 0])

    def get_measurements_for_experimentId(self, experimentId):
        result = self.client.query(
            f'SHOW measurements WHERE ExecutionId =~ /{experimentId}/ '
            f'or ExperimentId =~ /{experimentId}/',
            chunked=False, chunk_size=1000, epoch='ns')
        return [item["name"] for item in list(result['measurements'])]

    def cache_experimentIds(self):
        experimentIds = []
        measurements = [measurement['name'] for measurement
                        in self.client.get_list_measurements()]
        for measurement in tqdm(measurements, desc="Getting ExecutionIds"):
            results = self.query_df(
                f'SELECT distinct(ExecutionId) as ExecutionId '
                f'from (SELECT * from "{measurement}")')
            if not results.empty:
                experimentIds += list(results['ExecutionId'].astype(str))
        return sorted(list(set(experimentIds)))

    def query_df(self, query):
        # Query the InfluxDB HTTP API directly and read the CSV response
        # into a DataFrame.
        data = {
            'db': self.database,
            'u': self.user,
            'p': self.password,
            'precision': 'ns',
            'q': query,
        }
        url_values = urllib.parse.urlencode(data)
        url = f"http://{self.host}:{self.port}/query?" + url_values
        request = urllib.request.Request(
            url, headers={'Accept': 'application/csv'})
        response = urllib.request.urlopen(request)
        response_bytestr = response.read()
        if response_bytestr:
            return pd.read_csv(BytesIO(response_bytestr), sep=",",
                               low_memory=False)
        else:
            return pd.DataFrame()

    def get_all_experimentIds(self):
        return self.experimentIds
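# Example usage of `DataCollector` (a sketch; the connection details and
# the experiment id are placeholders):
collector = DataCollector('localhost', 8086, 'user', 'pass', 'experiments')
if collector.client is not None:
    print(collector.get_all_experimentIds())
    df = collector.get_data('exp-001', max_lag='5s', limit=1000)
    print(df.head())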