def test_consolidate_data(self):
    """ Test dataframe consolidation function """
    # note: pandas renamed the 'labels' argument to 'codes' in 0.24
    midx = pd.MultiIndex(levels=[[0, 1, 2, 3, 4], ['sys1']],
                         labels=[[0, 1, 2, 3, 4], [0, 0, 0, 0, 0]],
                         names=[df_tools.DATETIME_TAG, 'system'])
    df1 = pd.DataFrame(np.random.randint(0, 10, (5, 3)),
                       columns=['A', 'B', 'C'])
    df1.index.name = df_tools.DATETIME_TAG
    df1_midx = df1.set_index(midx)
    df2 = pd.DataFrame(np.random.randint(0, 10, (5, 3)),
                       columns=['A', 'B', 'C'])
    df2.index.name = df_tools.DATETIME_TAG
    # Consolidating without a system name should raise a ToDfError
    with self.assertRaises(df_tools.ToDfError):
        df_tools.consolidate_data(df1)
    # Consolidating with a system name should return a MultiIndex dataframe
    assert_frame_equal(df_tools.consolidate_data(df1.copy(), system='sys1'),
                       df1_midx)
    data = pd.DataFrame()
    for (i, partial_dataframe) in enumerate([df1, df2]):
        data = df_tools.consolidate_data(partial_dataframe,
                                         dataframe=data,
                                         system='sys{0}'.format(i + 1))
    self.assertTupleEqual(data.shape, (10, 3))
    self.assertTupleEqual(data.index.levshape, (5, 2))
    assert_frame_equal(df1, data.xs('sys1', level='system'))
    assert_frame_equal(df2, data.xs('sys2', level='system'))
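# --- Usage sketch (not part of the test suite above) ---
# Minimal standalone illustration of the consolidation semantics verified by
# test_consolidate_data: consolidate_data() tags a datetime-indexed dataframe
# with a 'system' index level, and successive calls stack systems into a
# single MultiIndex dataframe. Assumes df_tools is importable as in the test.
import numpy as np
import pandas as pd

import df_tools

merged = pd.DataFrame()
for name in ('sys1', 'sys2'):
    partial = pd.DataFrame(np.random.randint(0, 10, (5, 3)),
                           columns=['A', 'B', 'C'])
    partial.index.name = df_tools.DATETIME_TAG
    merged = df_tools.consolidate_data(partial,
                                       dataframe=merged,
                                       system=name)
print(merged.shape)           # (10, 3): rows from both systems stacked
print(merged.index.levshape)  # (5, 2): 5 timestamps x 2 systems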
def test_plotvar(self):
    """ Test function for plot_var """
    dataframe = df_tools.consolidate_data(self.test_data, system='SYSTEM1')
    # make a plot filtering by system, uses dataframe.plot()
    myplot = gen_plot.plot_var(dataframe,
                               'FRONTEND_11_OUTPUT_OK',
                               system='SYSTEM1',
                               logger=self.logger)
    self.assertTrue(myplot.has_data())
    self.assertTrue(myplot.is_figure_set())
    # make a plot without filters, uses matplotlib.pyplot.plot()
    myplot = gen_plot.plot_var(dataframe,
                               'FRONTEND_11_OUTPUT_OK',
                               logger=self.logger)
    self.assertTrue(myplot.has_data())
    self.assertTrue(myplot.is_figure_set())
    # Selecting a non-existing system should return an empty plot
    voidplot = gen_plot.plot_var(dataframe,
                                 'FRONTEND_11_OUTPUT_OK',
                                 system='SYSTEM2',
                                 logger=self.logger)
    self.assertFalse(voidplot.has_data())
    # an empty dataframe should likewise produce an empty plot
    voidplot = gen_plot.plot_var(pd.DataFrame(),
                                 'DONTCARE',
                                 logger=self.logger)
    self.assertFalse(voidplot.has_data())
    # same when trying to plot a non-existing variable
    voidplot = gen_plot.plot_var(dataframe,
                                 'DONTCARE',
                                 logger=self.logger)
    self.assertFalse(voidplot.has_data())
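# --- Usage sketch (assumptions: plot_var returns a matplotlib Axes, which is
# consistent with the has_data()/is_figure_set() calls above; 'dataframe' and
# 'logger' stand in for a consolidated dataframe and a logging.Logger) ---
myplot = gen_plot.plot_var(dataframe,
                           'FRONTEND_11_OUTPUT_OK',
                           system='SYSTEM1',
                           logger=logger)
myplot.get_figure().savefig('frontend_11_output_ok.png')  # persist to disk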
def test_dataframe_to_t4csv(self):
    """ Test reverse conversion (dataframe to T4-CSV) """
    t = tempfile.NamedTemporaryFile()
    t4files = df_tools.dataframe_to_t4csv(dataframe=self.test_data,
                                          output=t.name)
    self.assertTrue(len(t4files) > 0)
    that = self.collector_test.get_stats_from_host(list(t4files.values()))
    that = df_tools.consolidate_data(that,
                                     system=list(t4files.keys())[0])
    assert_frame_equal(self.test_data, that)
def _single_day_and_system_data(system, given_date=None):
    given_date = get_datetag(given_date)
    self.logger.info('Collecting data for system: {0}; day: {1}'
                     .format(system, given_date))
    with self.get_sftp_session(system) as session:
        result_data = self.get_system_data(session, system, given_date)
        self.data = df_tools.consolidate_data(result_data,
                                              dataframe=self.data,
                                              system=system)
        self.results_queue.put(system)  # flag this system as done
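# --- Consumer-side sketch (hypothetical; inferred only from the
# results_queue.put() call above) ---
# A plausible continuation of the enclosing method: spawn one worker thread
# per system, then drain the queue to wait until every system reports done.
import threading

for system in self.systems:
    threading.Thread(target=_single_day_and_system_data,
                     args=(system,)).start()
for _ in self.systems:
    done_system = self.results_queue.get()  # blocks until a worker finishes
    self.logger.debug('{0} | collection finished'.format(done_system))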
def create_reports_from_local(self, data_file, pkl=True, plain=False,
                              system=None, **kwargs):
    """
    Generate HTML files from data stored locally

    Arguments:
        data_file (str): Data filename

    Keyword Arguments:
        pkl (boolean or True):
            indicate if data is a pickled dataframe or a CSV
        plain (boolean or False):
            when ``pkl==False``, indicate if the CSV is a plain (aka excel
            format) or a T4-compliant CSV
        system (str):
            Indicate the system name of the input data, important when data
            comes in CSV format (``pkl==False``)
    """
    # load the input file
    if not os.path.exists(data_file):
        self.logger.error('{0} file {1} cannot be found'
                          .format('PKL' if pkl else 'CSV', data_file))
        raise IOError
    if pkl:
        _collector = collector.read_pickle(data_file, logger=self.logger)
        self.data = _collector.data
        self.logs = _collector.logs
        if system:
            self.systems = system if isinstance(system, list) else [system]
        else:
            self.systems = _collector.systems
    else:  # CSV
        if not system:
            system = os.path.splitext(os.path.basename(data_file))[0]
        self.data = df_tools.reload_from_csv(data_file, plain=plain)
        self.data = df_tools.consolidate_data(self.data, system=system)
        self.systems = system if isinstance(system, list) else [system]
    # Populate the log info with fake data
    for system in self.systems:
        self.logs[system] = ('Log collection omitted for locally '
                             'generated reports at {0} for {1}'
                             .format(self.date_tag(), system))
        self.logger.info(self.logs[system])
    # Create the reports
    self._reports_generator()
    self.logger.info('Done!')
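# --- Usage sketch (hypothetical instance and file names; argument names
# taken from the docstring above) ---
# Rebuild reports from a pickled collector, then from a T4-compliant CSV
# whose system name cannot be inferred from the file name:
orchestrator.create_reports_from_local('monitoring_data.pkl')
orchestrator.create_reports_from_local('stats.csv',
                                       pkl=False,
                                       system='SYSTEM1')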
def test_getstats(self):
    """ Test function for get_stats_from_host """
    df1 = self.collector_test.get_stats_from_host(filespec_list=TEST_CSV)
    col = collector.read_pickle(TEST_PKL)
    df2 = col.data
    df2.clean_calcs(TEST_CALC)  # undo calculations
    df1 = df_tools.consolidate_data(
        df1,
        system=df2.index.get_level_values('system').unique()[0])
    self.assertIsInstance(df1, pd.DataFrame)
    self.assertIsInstance(df2, pd.DataFrame)
    assert_frame_equal(df1, df2)
def get_data_and_logs(self, system):
    """
    Collect everything needed for a system.
    By default the connection is done via SSH tunnels.

    Arguments:
        system (str): Open an SFTP session to system and collect the CSVs

    Return:
        ``None`` (data is consolidated into ``self.data`` and logs are
        stored in ``self.logs[system]``)
    """
    # TODO: allow parallel (data | log) collection
    try:
        self.logger.info('{0} | Collecting statistics...'.format(system))
        if (self._check_if_using_gateway(system) and
                not self.check_if_tunnel_is_up(system)):
            self.logger.error('{0} | System not reachable!'.format(system))
            raise SFTPSessionError
        # Get an SFTP session
        sftp_session = self.get_sftp_session(system)
        if not sftp_session:
            raise SFTPSessionError('Cannot open an SFTP session to {0}'
                                   .format(system))
        with sftp_session as session:  # open the session
            # Get data from the remote system
            result_data = self.get_system_data(session, system)
            # Done gathering data, now get the logs
            if (self.nologs or result_data.empty or
                    not self.conf.has_option('MISC', 'remote_log_cmd')):
                result_logs = '{0} | Log collection omitted'.format(system)
                self.logger.info(result_logs)
            else:
                result_logs = self.get_system_logs(
                    sftp_session.ssh_transport,
                    system,
                    self.conf.get('MISC', 'remote_log_cmd')
                ) or '{0} | Missing logs!'.format(system)
    except (IOError, SFTPSessionError):
        result_data = pd.DataFrame()
        result_logs = 'Could not get information from this system'
    self.logger.debug('{0} | Consolidating results'.format(system))
    self.data = df_tools.consolidate_data(result_data,
                                          dataframe=self.data,
                                          system=system)
    self.logs[system] = result_logs
    self.results_queue.put(system)
def test_reports_generator(self):
    """ Test function for Orchestrator._reports_generator() """
    _orchestrator = self.orchestrator_test.clone()
    _orchestrator.data = self.test_data
    _orchestrator._reports_generator()
    self.assertNotEqual(_orchestrator.reports_written, [])
    for report_file in _orchestrator.reports_written:
        self.assertTrue(os.path.exists(report_file))
    # Test the non-threaded version
    _orchestrator.reports_written = []  # reset the count
    _orchestrator.safe = True
    _orchestrator.data = consolidate_data(partial_dataframe=self.test_data,
                                          dataframe=self.test_data,
                                          system='SYS2')
    _orchestrator._reports_generator()
    self.assertNotEqual(_orchestrator.reports_written, [])
    self.assertEqual(len(_orchestrator.reports_written), 2)
    for report_file in _orchestrator.reports_written:
        self.assertTrue(os.path.exists(report_file))
def get_stats_from_host(self, filespec_list=None, hostname=None,
                        compressed=False, sftp_session=None, **kwargs):
    """
    Optionally connect to a remote system via SFTP to read CSV files, which
    might be compressed in ZIP files, then call the CSV-pandas conversion
    function.

    Arguments:
        filespec_list (Optional[list]):
            List of strings, each representing a valid file specification
            (wildcards ``*`` allowed)
        hostname (Optional[str]):
            Remote hostname where to download the CSV files.
            Default: working with local filesystem
        compressed (Optional[boolean]):
            Whether or not the files matching ``filespec_list`` are
            compressed (deflate). Default: ``False`` (not compressed)
        sftp_session (Optional[paramiko.SFTPClient]):
            SFTP session to the remote ``hostname``.
            Default: ``None`` (work with local filesystem)
        files_folder (Optional[str]):
            folder where files are located, either on the SFTP server or on
            the local filesystem

    Return:
        ``pandas.DataFrame``
    """
    _df = pd.DataFrame()
    files = self.files_lookup(hostname=hostname,
                              filespec_list=filespec_list,
                              compressed=compressed,
                              sftp_session=sftp_session,
                              **kwargs)
    if not files:
        self.logger.debug('Nothing gathered from {0}, no files were '
                          'selected for pattern "{1}"'
                          .format(hostname or 'local system',
                                  filespec_list))
        return _df
    progressbar_prefix = 'Loading {0}files{1}'.format(
        'compressed ' if compressed else '',
        ' from {0}'.format(hostname) if hostname else '')
    for a_file in tqdm.tqdm(files,
                            leave=True,
                            desc=progressbar_prefix,
                            disable=compressed,
                            unit='Archive' if compressed else 'File'):
        if compressed:
            _df = _df.combine_first(
                self._load_zipfile(zip_file=a_file,
                                   sftp_session=sftp_session))
            # if no hostname was given, try to infer it from the file name
            regex = (r't4_(\w+)[0-9]_\w+_[0-9]{{4}}_[0-9]{{4}}_\w+.{0}'
                     .format(os.path.splitext(a_file)[-1]))
            if not hostname and re.search(regex, a_file):
                hostname = re.search(regex, a_file).groups()[0]
            if hostname:
                _df = df_tools.consolidate_data(_df, system=hostname)
        else:
            _df = _df.combine_first(
                df_tools.dataframize(data_file=a_file,
                                     session=sftp_session,
                                     logger=self.logger))
    return _df
def get_stats_from_host(self, filespec_list=None, hostname=None,
                        compressed=False, sftp_session=None, **kwargs):
    """
    Optionally connect to a remote system via SFTP to read CSV files, which
    might be compressed in ZIP files, then call the CSV-pandas conversion
    function.

    Arguments:
        filespec_list (Optional[list]):
            List of strings, each representing a valid file specification
            (wildcards ``*`` allowed)
        hostname (Optional[str]):
            Remote hostname where to download the CSV files.
            Default: working with local filesystem
        compressed (Optional[boolean]):
            Whether or not the files matching ``filespec_list`` are
            compressed (deflate). Default: ``False`` (not compressed)
        sftp_session (Optional[paramiko.SFTPClient]):
            SFTP session to the remote ``hostname``.
            Default: ``None`` (work with local filesystem)
        files_folder (Optional[str]):
            folder where files are located, either on the SFTP server or on
            the local filesystem

    Return:
        ``pandas.DataFrame``
    """
    _df = pd.DataFrame()
    files = self.files_lookup(hostname=hostname,
                              filespec_list=filespec_list,
                              compressed=compressed,
                              sftp_session=sftp_session,
                              **kwargs)
    if not files:
        self.logger.debug('Nothing gathered from {0}, no files were '
                          'selected for pattern "{1}"'
                          .format(hostname or 'local system',
                                  filespec_list))
        return _df
    progressbar_prefix = 'Loading {0}files{1}'.format(
        'compressed ' if compressed else '',
        ' from {0}'.format(hostname) if hostname else '')
    # pick the notebook-friendly progress bar when running under IPython
    tqdm_call = (tqdm.tqdm_notebook if is_running_from_ipython()
                 else tqdm.tqdm)
    for a_file in tqdm_call(files,
                            leave=True,
                            desc=progressbar_prefix,
                            disable=compressed,
                            unit='Archive' if compressed else 'File'):
        if compressed:
            _df = _df.combine_first(
                self._load_zipfile(zip_file=a_file,
                                   sftp_session=sftp_session))
            # if no hostname was given, try to infer it from the file name
            regex = (r't4_(\w+)[0-9]_\w+_[0-9]{{4}}_[0-9]{{4}}_\w+.{0}'
                     .format(os.path.splitext(a_file)[-1]))
            if not hostname and re.search(regex, a_file):
                hostname = re.search(regex, a_file).groups()[0]
            if hostname:
                _df = df_tools.consolidate_data(_df, system=hostname)
        else:
            _df = _df.combine_first(
                df_tools.dataframize(data_file=a_file,
                                     session=sftp_session,
                                     logger=self.logger))
    return _df
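# --- Usage sketch (hypothetical instance, session and file patterns; the
# keyword arguments are those defined by get_stats_from_host above) ---
# Read plain CSVs from the local filesystem:
stats = my_collector.get_stats_from_host(filespec_list=['t4_*.csv'])
# Read ZIP-compressed CSVs from a remote host over an open SFTP session; the
# system name is inferred from the archive file names whenever possible:
stats = my_collector.get_stats_from_host(filespec_list=['*.zip'],
                                         hostname='sys1',
                                         compressed=True,
                                         sftp_session=session)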