示例#1
0
    def test_consolidate_data(self):
        """
        Test dataframe consolidation function.

        The test expects ``df_tools.consolidate_data`` to:
            - raise ``df_tools.ToDfError`` when called without a system,
            - return the data indexed by (datetime, system) when a system
              name is given,
            - accumulate successive partial dataframes when it is fed its
              previous result through ``dataframe``.
        """
        # Expected (datetime, system) index for a 5-row frame of 'sys1'.
        # ``from_product`` is used instead of the ``MultiIndex`` constructor
        # because the constructor's ``labels`` keyword was renamed to
        # ``codes`` and later removed from pandas; ``from_product`` builds
        # the same levels on old and new releases alike.
        midx = pd.MultiIndex.from_product(
            [[0, 1, 2, 3, 4], ['sys1']],
            names=[df_tools.DATETIME_TAG, 'system'])

        df1 = pd.DataFrame(np.random.randint(0, 10, (5, 3)),
                           columns=['A', 'B', 'C'])
        df1.index.name = df_tools.DATETIME_TAG

        df1_midx = df1.set_index(midx)

        df2 = pd.DataFrame(np.random.randint(0, 10, (5, 3)),
                           columns=['A', 'B', 'C'])
        df2.index.name = df_tools.DATETIME_TAG

        # Consolidate with nothing should raise a ToDfError
        with self.assertRaises(df_tools.ToDfError):
            df_tools.consolidate_data(df1)

        # Consolidate with a system name should return MultiIndex dataframe
        assert_frame_equal(df_tools.consolidate_data(df1.copy(),
                                                     system='sys1'),
                           df1_midx)

        # Feed both frames one after the other, each under its own system
        data = pd.DataFrame()
        for (i, partial_dataframe) in enumerate([df1, df2]):
            data = df_tools.consolidate_data(partial_dataframe,
                                             dataframe=data,
                                             system='sys{}'.format(i + 1))
        # 2 systems x 5 rows, 3 columns; 5 datetimes and 2 systems in index
        self.assertTupleEqual(data.shape, (10, 3))
        self.assertTupleEqual(data.index.levshape, (5, 2))

        # Each system's cross-section must give back its original frame
        assert_frame_equal(df1, data.xs('sys1', level='system'))
        assert_frame_equal(df2, data.xs('sys2', level='system'))
示例#2
0
    def test_plotvar(self):
        """ Check plot_var against valid, filtered and void inputs """
        var_name = 'FRONTEND_11_OUTPUT_OK'
        data = df_tools.consolidate_data(self.test_data, system='SYSTEM1')

        # Plot restricted to one existing system
        filtered = gen_plot.plot_var(data,
                                     var_name,
                                     system='SYSTEM1',
                                     logger=self.logger)
        self.assertTrue(filtered.has_data())
        self.assertTrue(filtered.is_figure_set())

        # Same variable, this time without any system filter
        unfiltered = gen_plot.plot_var(data, var_name, logger=self.logger)
        self.assertTrue(unfiltered.has_data())
        self.assertTrue(unfiltered.is_figure_set())

        # A system that is not in the data must give a plot without data
        no_system = gen_plot.plot_var(data,
                                      var_name,
                                      system='SYSTEM2',
                                      logger=self.logger)
        self.assertFalse(no_system.has_data())

        # An empty dataframe must also give a plot without data
        no_frame = gen_plot.plot_var(pd.DataFrame(),
                                     'DONTCARE',
                                     logger=self.logger)
        self.assertFalse(no_frame.has_data())

        # ... and so must a variable that does not exist in the data
        no_variable = gen_plot.plot_var(data, 'DONTCARE', logger=self.logger)
        self.assertFalse(no_variable.has_data())
示例#3
0
    def test_dataframe_to_t4csv(self):
        """
        Test reverse conversion (dataframe to T4-CSV).

        Dumps ``self.test_data`` with ``df_tools.dataframe_to_t4csv``, loads
        the reported file(s) back through the collector and checks that the
        round trip gives the original dataframe.
        """
        # Context manager so that the temporary file is always closed, even
        # when an assertion or a conversion step fails half way through.
        with tempfile.NamedTemporaryFile() as t:
            t4files = df_tools.dataframe_to_t4csv(dataframe=self.test_data,
                                                  output=t.name)
            self.assertGreater(len(t4files), 0)
            # t4files is used as a mapping here: its values are handed over
            # as file specs and its first key as the system name
            that = self.collector_test.get_stats_from_host(
                list(t4files.values()))
            that = df_tools.consolidate_data(that,
                                             system=list(t4files.keys())[0])
        assert_frame_equal(self.test_data, that)
示例#4
0
 def _single_day_and_system_data(system, given_date=None):
     """
     Collect the data of ``system`` for one day, merge it into
     ``self.data`` and put ``system`` in ``self.results_queue``.
     """
     day_tag = get_datetag(given_date)
     message = 'Collecting data for system: {0}; day: {1}'.format(system,
                                                                  day_tag)
     self.logger.info(message)
     with self.get_sftp_session(system) as sftp:
         collected = self.get_system_data(sftp, system, day_tag)
         self.data = df_tools.consolidate_data(collected,
                                               dataframe=self.data,
                                               system=system)
         # flag this system as done
         self.results_queue.put(system)
示例#5
0
 def _single_day_and_system_data(system, given_date=None):
     # Worker: fetch one system's data for one day over SFTP, fold it into
     # self.data and then report the system through self.results_queue.
     datetag = get_datetag(given_date)
     self.logger.info(
         'Collecting data for system: {0}; day: {1}'.format(system, datetag)
     )
     sftp_context = self.get_sftp_session(system)
     with sftp_context as session:
         partial = self.get_system_data(session, system, datetag)
         merged = df_tools.consolidate_data(
             partial, dataframe=self.data, system=system
         )
         self.data = merged
         self.results_queue.put(system)  # this system is done
示例#6
0
    def create_reports_from_local(self,
                                  data_file,
                                  pkl=True,
                                  plain=False,
                                  system=None,
                                  **kwargs):
        """
        Generate HTML files from data stored locally

        Arguments:
            data_file (str):
                Data filename
        Keyword Arguments:
            pkl (boolean or True):
                indicate if data is a pickled dataframe or a CSV
            plain (boolean or False):
                when ``pkl==False``, indicate if the CSV is a plain (aka excel
                format) or a T4-compliant CSV
            system (str or list):
                Indicate the system name of the input data, important when data
                comes in CSV format (``pkl==False``); in that case it defaults
                to the base name of ``data_file`` without its extension.
                For pickled input it overrides the systems stored in the
                pickle.
        Raises:
            IOError: when ``data_file`` does not exist
        """
        # load the input file
        if not os.path.exists(data_file):
            message = '{0} file {1} cannot be found'.format(
                'PKL' if pkl else 'CSV', data_file)
            self.logger.error(message)
            # carry the reason in the exception as well, not only in the log
            raise IOError(message)
        if pkl:
            _collector = collector.read_pickle(data_file, logger=self.logger)
            self.data = _collector.data
            self.logs = _collector.logs
            if system:
                self.systems = system if isinstance(system, list) else [system]
            else:
                self.systems = _collector.systems
        else:  # CSV
            if not system:
                system = os.path.splitext(os.path.basename(data_file))[0]
            self.data = df_tools.reload_from_csv(data_file,
                                                 plain=plain)
            self.data = df_tools.consolidate_data(self.data,
                                                  system=system)
            self.systems = system if isinstance(system, list) else [system]
        # Populate the log info with fake data; a dedicated loop variable is
        # used so that the ``system`` argument is not overwritten
        for system_name in self.systems:
            self.logs[system_name] = ('Log collection omitted for '
                                      'locally generated reports at '
                                      '{0} for {1}'.format(self.date_tag(),
                                                           system_name))
            self.logger.info(self.logs[system_name])

        # Create the reports
        self._reports_generator()
        self.logger.info('Done!')
    def test_getstats(self):
        """ Compare get_stats_from_host output with the pickled data """
        from_csv = self.collector_test.get_stats_from_host(
            filespec_list=TEST_CSV
        )
        from_pkl = collector.read_pickle(TEST_PKL).data
        from_pkl.clean_calcs(TEST_CALC)  # undo calculations
        # Index the CSV data under the first system found in the pickle
        system_name = from_pkl.index.get_level_values('system').unique()[0]
        from_csv = df_tools.consolidate_data(from_csv, system=system_name)

        for frame in (from_csv, from_pkl):
            self.assertIsInstance(frame, pd.DataFrame)
        assert_frame_equal(from_csv, from_pkl)
示例#8
0
    def get_data_and_logs(self, system):
        """
        Gather the statistics and, when applicable, the logs of a system.

        Arguments:
            system (str): system to open an SFTP session to
        Return:
            Nothing. The collected data is consolidated into ``self.data``,
            the logs (or a message explaining why there are none) are stored
            in ``self.logs[system]`` and ``system`` is finally put in
            ``self.results_queue``.

        """
        # TODO: allow parallel (data | log) collection
        try:
            self.logger.info('{0} | Collecting statistics...'.format(system))
            if self._check_if_using_gateway(system):
                if not self.check_if_tunnel_is_up(system):
                    self.logger.error(
                        '{0} | System not reachable!'.format(system)
                    )
                    raise SFTPSessionError

            # Get an sftp session
            sftp_session = self.get_sftp_session(system)
            if not sftp_session:
                error_text = 'Cannot open an SFTP session to {0}'.format(
                    system
                )
                raise SFTPSessionError(error_text)

            with sftp_session as session:  # open the session
                # Data first ...
                result_data = self.get_system_data(session, system)
                # ... then the logs, unless there is a reason to skip them
                skip_logs = (
                    self.nologs
                    or result_data.empty
                    or not self.conf.has_option('MISC', 'remote_log_cmd')
                )
                if skip_logs:
                    result_logs = '{0} | Log collection omitted'.format(system)
                    self.logger.info(result_logs)
                else:
                    transport = sftp_session.ssh_transport
                    remote_command = self.conf.get('MISC', 'remote_log_cmd')
                    result_logs = self.get_system_logs(transport,
                                                       system,
                                                       remote_command)
                    if not result_logs:
                        result_logs = '{0} | Missing logs!'.format(system)
        except (IOError, SFTPSessionError):
            # Nothing usable: go on with an empty frame and a fixed message
            result_data = pd.DataFrame()
            result_logs = 'Could not get information from this system'

        self.logger.debug('{0} | Consolidating results'.format(system))
        consolidated = df_tools.consolidate_data(result_data,
                                                 dataframe=self.data,
                                                 system=system)
        self.data = consolidated
        self.logs[system] = result_logs
        self.results_queue.put(system)
示例#9
0
    def get_data_and_logs(self, system):
        """
        Collect data and logs for one system and store the outcome.

        Arguments:
            system (str): Open an SFTP session to system and collect the CSVs
        Return:
            ``None``; results are kept in ``self.data`` and
            ``self.logs[system]``, and ``system`` is queued in
            ``self.results_queue`` once processed.

        """
        # TODO: allow parallel (data | log) collection
        try:
            self.logger.info('{0} | Collecting statistics...'.format(system))
            tunnel_down = (self._check_if_using_gateway(system)
                           and not self.check_if_tunnel_is_up(system))
            if tunnel_down:
                self.logger.error('{0} | System not reachable!'.format(system))
                raise SFTPSessionError

            sftp_session = self.get_sftp_session(system)
            if not sftp_session:
                raise SFTPSessionError(
                    'Cannot open an SFTP session to {0}'.format(system))

            with sftp_session as session:
                # Statistics come first, logs follow
                result_data = self.get_system_data(session, system)
                logs_wanted = not (
                    self.nologs
                    or result_data.empty
                    or not self.conf.has_option('MISC', 'remote_log_cmd'))
                if logs_wanted:
                    collected_logs = self.get_system_logs(
                        sftp_session.ssh_transport, system,
                        self.conf.get('MISC', 'remote_log_cmd'))
                    # fall back to a notice when nothing came back
                    result_logs = (collected_logs
                                   or '{0} | Missing logs!'.format(system))
                else:
                    result_logs = '{0} | Log collection omitted'.format(system)
                    self.logger.info(result_logs)
        except (IOError, SFTPSessionError):
            result_logs = 'Could not get information from this system'
            result_data = pd.DataFrame()

        self.logger.debug('{0} | Consolidating results'.format(system))
        self.data = df_tools.consolidate_data(
            result_data, dataframe=self.data, system=system)
        self.logs[system] = result_logs
        self.results_queue.put(system)
示例#10
0
 def test_reports_generator(self):
     """ Exercise Orchestrator._reports_generator() on a cloned instance """
     orch = self.orchestrator_test.clone()

     # First pass: reports out of the plain test data
     orch.data = self.test_data
     orch._reports_generator()
     self.assertNotEqual(orch.reports_written, [])
     for written in orch.reports_written:
         self.assertTrue(os.path.exists(written))

     # Second pass with ``safe`` enabled (the non-threaded version, as per
     # the original note) and the test data consolidated once more as 'SYS2'
     orch.reports_written = []  # reset the count
     orch.safe = True
     two_systems = consolidate_data(partial_dataframe=self.test_data,
                                    dataframe=self.test_data,
                                    system='SYS2')
     orch.data = two_systems
     orch._reports_generator()
     self.assertNotEqual(orch.reports_written, [])
     self.assertEqual(len(orch.reports_written), 2)
     for written in orch.reports_written:
         self.assertTrue(os.path.exists(written))
示例#11
0
    def test_plotvar(self):
        """ Test plot_var with plottable input and with void input """
        dataframe = df_tools.consolidate_data(self.test_data, system='SYSTEM1')

        # Plottable input: filtered by an existing system, then unfiltered
        for filters in ({'system': 'SYSTEM1'}, {}):
            myplot = gen_plot.plot_var(dataframe,
                                       'FRONTEND_11_OUTPUT_OK',
                                       logger=self.logger,
                                       **filters)
            self.assertTrue(myplot.has_data())
            self.assertTrue(myplot.is_figure_set())

        # Void input, each case must give a plot that holds no data:
        #   - a system that is not in the data
        #   - an empty dataframe
        #   - a variable that is not in the data
        void_cases = (
            (dataframe, 'FRONTEND_11_OUTPUT_OK', {'system': 'SYSTEM2'}),
            (pd.DataFrame(), 'DONTCARE', {}),
            (dataframe, 'DONTCARE', {}),
        )
        for frame, variable, filters in void_cases:
            voidplot = gen_plot.plot_var(frame,
                                         variable,
                                         logger=self.logger,
                                         **filters)
            self.assertFalse(voidplot.has_data())
示例#12
0
    def get_stats_from_host(self,
                            filespec_list=None,
                            hostname=None,
                            compressed=False,
                            sftp_session=None,
                            **kwargs):
        """
        Optionally connect to a remote system via SFTP to read CSV files, which
        might be compressed in ZIP files, then call the CSV-pandas conversion
        function.

        Arguments:
            filespec_list (Optional[list]):
                List of strings, each representing a valid file specification
                (wildcards (``*``) allowed)
            hostname (Optional[str]):
                Remote hostname where to download the CSV files.
                Default: working with local filesystem.
                For compressed files, when not given, it is inferred from
                the archive name if it looks like
                ``t4_<host><digit>_<word>_<4 digits>_<4 digits>_<word><ext>``
            compressed (Optional[boolean]):
                Whether or not the files matching ``filespec_list`` are
                compressed (deflate)
                Default: ``False`` (not compressed)
            sftp_session (Optional[paramiko.SFTPClient]):
                SFTP session to the remote ``hostname``
                Default: ``None`` (work with local filesystem)
            **kwargs:
                Passed on as they come to ``self.files_lookup``, e.g.
                files_folder (Optional[str]): folder where files are
                located, either on sftp server or local filesystem
        Return:
            ``pandas.DataFrame``, empty when no file was selected
        """
        _df = pd.DataFrame()

        files = self.files_lookup(hostname=hostname,
                                  filespec_list=filespec_list,
                                  compressed=compressed,
                                  sftp_session=sftp_session,
                                  **kwargs)
        if not files:
            self.logger.debug('Nothing gathered from {0}, no files were '
                              'selected for pattern "{1}"'
                              .format(hostname or 'local system',
                                      filespec_list))
            return _df

        progressbar_prefix = 'Loading {0}files{1}'.format(
            'compressed ' if compressed else '',
            ' from {0}'.format(hostname) if hostname else ''
        )
        for a_file in tqdm.tqdm(files,
                                leave=True,
                                desc=progressbar_prefix,
                                disable=compressed,
                                unit='Archive' if compressed else 'File'):
            if compressed:
                _df = _df.combine_first(
                    self._load_zipfile(zip_file=a_file,
                                       sftp_session=sftp_session)
                )
                # if no hostname, try to infer it from the file name
                if not hostname:
                    # splitext() keeps the leading dot of the extension, so
                    # the extension is appended escaped and as is; the
                    # pattern is a raw string so that ``\w`` reaches ``re``
                    # untouched
                    extension = os.path.splitext(a_file)[-1]
                    match = re.search(
                        r't4_(\w+)[0-9]_\w+_[0-9]{4}_[0-9]{4}_\w+'
                        + re.escape(extension),
                        a_file
                    )
                    if match:
                        # once found, it is kept for the remaining files
                        hostname = match.group(1)

                if hostname:
                    _df = df_tools.consolidate_data(_df,
                                                    system=hostname)

            else:
                _df = _df.combine_first(
                    df_tools.dataframize(data_file=a_file,
                                         session=sftp_session,
                                         logger=self.logger)
                )
        return _df
示例#13
0
    def get_stats_from_host(self,
                            filespec_list=None,
                            hostname=None,
                            compressed=False,
                            sftp_session=None,
                            **kwargs):
        """
        Optionally connect to a remote system via SFTP to read CSV files, which
        might be compressed in ZIP files, then call the CSV-pandas conversion
        function.

        Arguments:
            filespec_list (Optional[list]):
                List of strings, each representing a valid file specification
                (wildcards (``*``) allowed)
            hostname (Optional[str]):
                Remote hostname where to download the CSV files.
                Default: working with local filesystem.
                When missing and the files are compressed, the name is
                taken from the first archive name that matches
                ``t4_<host><digit>_<word>_<4 digits>_<4 digits>_<word><ext>``
            compressed (Optional[boolean]):
                Whether or not the files matching ``filespec_list`` are
                compressed (deflate)
                Default: ``False`` (not compressed)
            sftp_session (Optional[paramiko.SFTPClient]):
                SFTP session to the remote ``hostname``
                Default: ``None`` (work with local filesystem)
            **kwargs:
                Handed over untouched to ``self.files_lookup``, e.g.
                files_folder (Optional[str]): folder where files are
                located, either on sftp server or local filesystem
        Return:
            ``pandas.DataFrame``, empty when no file was selected
        """
        _df = pd.DataFrame()

        files = self.files_lookup(hostname=hostname,
                                  filespec_list=filespec_list,
                                  compressed=compressed,
                                  sftp_session=sftp_session,
                                  **kwargs)
        if not files:
            self.logger.debug('Nothing gathered from {0}, no files were '
                              'selected for pattern "{1}"'.format(
                                  hostname or 'local system', filespec_list))
            return _df

        progressbar_prefix = 'Loading {0}files{1}'.format(
            'compressed ' if compressed else '',
            ' from {0}'.format(hostname) if hostname else '')
        # pick the progress bar flavour according to the running environment
        tqdm_call = tqdm.tqdm_notebook if is_running_from_ipython() \
            else tqdm.tqdm
        for a_file in tqdm_call(files,
                                leave=True,
                                desc=progressbar_prefix,
                                disable=compressed,
                                unit='Archive' if compressed else 'File'):
            if compressed:
                _df = _df.combine_first(
                    self._load_zipfile(zip_file=a_file,
                                       sftp_session=sftp_session))
                # if no hostname, try to infer it from the file name
                if not hostname:
                    # Raw string, so ``\w`` is not taken as a string escape.
                    # The extension returned by splitext() already carries
                    # its dot, hence it is escaped and appended verbatim.
                    regex = (r't4_(\w+)[0-9]_\w+_[0-9]{4}_[0-9]{4}_\w+'
                             + re.escape(os.path.splitext(a_file)[-1]))
                    found = re.search(regex, a_file)
                    if found:
                        # kept for the files still to be processed
                        hostname = found.group(1)

                if hostname:
                    _df = df_tools.consolidate_data(_df, system=hostname)

            else:
                _df = _df.combine_first(
                    df_tools.dataframize(data_file=a_file,
                                         session=sftp_session,
                                         logger=self.logger))
        return _df