Пример #1
0
 def test_chaining_start(self):
     """Check that ``start()`` can be chained directly onto the constructor.

     The object returned by the chained call must expose a non-None
     ``spinner_id``.
     """
     running = HaloNotebook().start()
     self.assertIsNotNone(running.spinner_id)
     running.stop()
Пример #2
0
def add_features(input_file, output_file, force):
    """Build the feature-enriched dataset and write it to ``output_file``.

    The latitude/longitude step is recomputed from ``input_file`` when
    ``force`` is true or ``output_file`` does not exist yet; otherwise the
    existing ``output_file`` is loaded. In both cases ``combine_features`` is
    then applied and the result is written to ``output_file``.

    Parameters
    ----------
    input_file: str
        CSV file holding the cleaned data.
    output_file: str
        CSV file the enriched data is written to (and read back from when it
        already exists and ``force`` is false).
    force: bool
        Recompute the latitude/longitude step even if ``output_file`` exists.

    Returns
    -------
    The dataframe returned by ``combine_features``.
    """
    spinner = Halo(text='Building features...', spinner='dots')

    # The input is always read, even when the cached output ends up being used.
    clean_data = pd.read_csv(input_file)

    reuse_existing = not (force or not os.path.exists(output_file))
    if reuse_existing:
        spinner.start("Loading transformed file...")
        time.sleep(2)
        enriched = pd.read_csv(output_file)
        spinner.stop_and_persist(text="Transformed file already exists!")
    else:
        # Add lat/lon columns and persist the intermediate result.
        spinner.start("Adding Latitude and Longitude columns")
        enriched = apply_nomatin(clean_data)
        enriched.to_csv(output_file, index=False)
        spinner.succeed("Latitude and Longitude features added!")

    # Combine features and overwrite the output with the final frame.
    enriched = combine_features(enriched)
    enriched.to_csv(output_file, index=False)

    return enriched
Пример #3
0
    def anonymize(self, columns_to_exclude=None):
        """
        Anonymize the dataframe in a manner that leaves all pre-learning and post-learning analyses (including data valuation, variable selection, model-driven improvability, data-driven improvability and model explanation) invariant.

        Any transformation on continuous variables that preserves ranks will not change our pre-learning and post-learning analyses. The same holds for any 1-to-1 transformation on categorical variables.

        This implementation replaces ordinal values (i.e. any column that can be cast as a float) with the standard normal CDF of their within-column standardized value, rounded to 3 decimals (a column with zero standard deviation is only centered). For each non-ordinal column, we form the sorted set of all values, we assign a unique integer index to each value in the set, and we systematically replace said value appearing in the dataframe by the hexadecimal code of its associated integer index.

        For regression problems, accurate estimation of RMSE related metrics require the target column (and the prediction column for post-learning analyses) not to be anonymized.


        Parameters
        ----------
        columns_to_exclude: list (optional)
            List of columns not to anonymize (e.g. target and prediction columns for regression problems). Defaults to no exclusion.


        Returns
        -------
        result : pandas.DataFrame
            A copy of the dataframe in which every column not listed in :code:`columns_to_exclude` has been anonymized. The original dataframe is left untouched.
        """
        # None replaces the former mutable ``[]`` default so that no list
        # object is shared between calls.
        excluded = () if columns_to_exclude is None else columns_to_exclude

        spinner = Halo(text='Preparing data upload', spinner='dots')
        spinner.start()
        try:
            df = self._obj.copy()
            for col in df.columns:
                if col in excluded:
                    continue

                if df.kxy.is_categorical(col) or df[col].dtype.name == 'category':
                    # Note: By using 'category' as dtype you are implicitly telling us that the 'natural'
                    # order of values does not matter.
                    unique_values = sorted(set(df[col].values))
                    mapping = {
                        value: "0x{:03x}".format(index)
                        for index, value in enumerate(unique_values)
                    }
                    df[col] = df[col].apply(mapping.get)
                else:
                    # Note: Any monotonic transformation applied to any continuous column would work.
                    # The gaussian scoring below makes no assumption on marginals whatsoever.
                    x = df[col].values.astype(float)
                    x = x - np.nanmean(x)
                    s = np.nanstd(x)
                    if s > 0.0:
                        x = x / s
                        x = norm.cdf(x)
                    df[col] = np.around(x.copy(), 3)
        except Exception:
            # Do not leave the spinner animating when anonymization fails.
            spinner.fail()
            raise

        spinner.succeed()
        return df
Пример #4
0
    def test_unavailable_spinner_defaults(self):
        """Check the defaults when only a positional argument is given.

        The single positional argument must end up as the text, and the
        spinner must be the default one.
        """
        halo = HaloNotebook('dot')

        self.assertEqual(halo.text, 'dot')
        self.assertEqual(halo.spinner, default_spinner)
Пример #5
0
 def test_ignore_multiple_start_calls(self):
     """Check that a second ``start()`` leaves the spinner ID unchanged."""
     halo = HaloNotebook()
     halo.start()
     first_id = halo.spinner_id

     halo.start()
     self.assertEqual(halo.spinner_id, first_id)

     halo.stop()
Пример #6
0
 def __init__(self, text):
     """Initialize the wrapper by creating the underlying Halo spinner.

     Parameters
     ----------
     text: str
         Text to display while the spinner is in use.
     """
     halo_options = {'text': text, 'spinner': 'dots'}
     self.spinner = Halo(**halo_options)
Пример #7
0
    def test_info(self):
        """Test info method.

        After ``info()`` the last captured output line must match the info
        symbol (``ℹ`` or ``¡``) followed by the spinner text.
        """
        spinner = HaloNotebook()
        spinner.start('foo')
        spinner.info()

        output = self._get_test_output(spinner)
        pattern = re.compile(r'(ℹ|¡) foo', re.UNICODE)

        # assertRegex replaces the deprecated assertRegexpMatches alias,
        # which was removed from unittest in Python 3.12.
        self.assertRegex(output[-1], pattern)
        spinner.stop()
Пример #8
0
    def test_succeed(self):
        """Test succeed method.

        After ``succeed('foo')`` the last captured output line must match the
        success symbol (``✔`` or ``v``) followed by the text.
        """
        spinner = HaloNotebook()
        spinner.start('foo')
        spinner.succeed('foo')

        output = self._get_test_output(spinner)
        pattern = re.compile(r'(✔|v) foo', re.UNICODE)

        # assertRegex replaces the deprecated assertRegexpMatches alias,
        # which was removed from unittest in Python 3.12.
        self.assertRegex(output[-1], pattern)
        spinner.stop()
Пример #9
0
    def test_succeed_with_new_text(self):
        """Test succeed method with new text.

        The text passed to ``succeed`` must replace the text given to
        ``start`` in the last captured output line.
        """
        spinner = HaloNotebook()
        spinner.start('foo')
        spinner.succeed('bar')

        output = self._get_test_output(spinner)['text']
        pattern = re.compile(r'(✔|v) bar', re.UNICODE)

        # assertRegex replaces the deprecated assertRegexpMatches alias,
        # which was removed from unittest in Python 3.12.
        self.assertRegex(output[-1], pattern)
        spinner.stop()
Пример #10
0
    def test_fail(self):
        """Test fail method.

        After ``fail()`` the last captured output line must match the failure
        symbol (``✖`` or ``×``) followed by the spinner text.
        """
        spinner = HaloNotebook()
        spinner.start('foo')
        spinner.fail()

        output = self._get_test_output(spinner)['text']
        pattern = re.compile(r'(✖|×) foo', re.UNICODE)

        # assertRegex replaces the deprecated assertRegexpMatches alias,
        # which was removed from unittest in Python 3.12.
        self.assertRegex(output[-1], pattern)
        spinner.stop()
Пример #11
0
    def test_if_enabled(self):
        """Check that a spinner created with ``enabled=False`` emits no text."""
        disabled = HaloNotebook(text="foo", enabled=False)
        disabled.start()
        time.sleep(1)

        # Capture the output before clearing and stopping the spinner.
        captured = self._get_test_output(disabled)['text']
        disabled.clear()
        disabled.stop()

        self.assertEqual(len(captured), 0)
        self.assertEqual(captured, [])
Пример #12
0
    def test_warning(self):
        """Test warn method.

        After ``warn('Warning!')`` the last captured output line must match
        the warning symbol (``⚠`` or ``!!``) followed by the warning text.
        """
        spinner = HaloNotebook()
        spinner.start('foo')
        spinner.warn('Warning!')

        output = self._get_test_output(spinner)['text']
        pattern = re.compile(r'(⚠|!!) Warning!', re.UNICODE)

        # assertRegex replaces the deprecated assertRegexpMatches alias,
        # which was removed from unittest in Python 3.12.
        self.assertRegex(output[-1], pattern)
        spinner.stop()
Пример #13
0
    def test_context_manager(self):
        """Check a basic spinner driven through the ``with`` statement.

        The first three captured lines must be the first three frames, each
        followed by the text; after the block the output widget must equal
        ``spinner._output('')``.
        """
        with HaloNotebook(text='foo', spinner='dots') as spinner:
            time.sleep(1)
            rendered = self._get_test_output(spinner)['text']

        for index in range(3):
            self.assertEqual(rendered[index], '{} foo'.format(frames[index]))
        self.assertEqual(spinner.output.outputs, spinner._output(''))
Пример #14
0
class Spinner:
    """
    Spinner built on top of the Halo library.
    """
    def __init__(self, text):
        """Initialize the wrapper by creating the underlying Halo spinner.

        Parameters
        ----------
        text: str
            Text to display while the spinner is in use.
        """
        halo_options = {'text': text, 'spinner': 'dots'}
        self.spinner = Halo(**halo_options)

    def start(self):
        """Start the wrapped Halo spinner."""
        self.spinner.start()

    def stop(self):
        """Stop the wrapped Halo spinner."""
        self.spinner.stop()
Пример #15
0
    def test_text_animation(self):
        """Test the text gets animated when it is too long.

        With ``animation='marquee'`` the first three captured frames must
        show a window of the text shifted by one character per frame, and
        ``succeed`` must end with the success symbol followed by the new text.
        """
        text = 'This is a text that it is too long. In fact, it exceeds the eighty column standard ' \
               'terminal width, which forces the text frame renderer to add an ellipse at the end of the ' \
               'text. ' * 6
        spinner = HaloNotebook(text=text, spinner='dots', animation='marquee')

        spinner.start()
        time.sleep(1)
        output = self._get_test_output(spinner)

        terminal_width = get_terminal_columns()

        # Frame i shows text[i : terminal_width - 2 + i].
        for shift in range(3):
            expected_text = text[shift:terminal_width - 2 + shift]
            self.assertEqual(
                output[shift], '{0} {1}'.format(frames[shift], expected_text))

        spinner.succeed('End!')
        output = self._get_test_output(spinner)

        pattern = re.compile(r'(✔|v) End!', re.UNICODE)

        # assertRegex replaces the deprecated assertRegexpMatches alias,
        # which was removed from unittest in Python 3.12.
        self.assertRegex(output[-1], pattern)
Пример #16
0
    def test_text_ellipsing(self):
        """Test the text gets ellipsed if it's too long.

        The first three captured frames must show the text truncated to the
        terminal width and terminated by ``' (...)'``; ``succeed`` must end
        with the success symbol followed by the new text.
        """
        text = 'This is a text that it is too long. In fact, it exceeds the eighty column standard ' \
               'terminal width, which forces the text frame renderer to add an ellipse at the end of the ' \
               'text. ' * 6
        spinner = HaloNotebook(text=text, spinner='dots')

        spinner.start()
        time.sleep(1)
        output = self._get_test_output(spinner)['text']

        terminal_width = get_terminal_columns()

        # -6 of the ' (...)' ellipsis, -2 of the spinner and space
        truncated = text[:terminal_width - 6 - 2]
        for index in range(3):
            self.assertEqual(output[index], '{} {} (...)'.format(frames[index], truncated))

        spinner.succeed('End!')
        output = self._get_test_output(spinner)['text']

        pattern = re.compile(r'(✔|v) End!', re.UNICODE)

        # assertRegex replaces the deprecated assertRegexpMatches alias,
        # which was removed from unittest in Python 3.12.
        self.assertRegex(output[-1], pattern)
Пример #17
0
    def test_basic_spinner(self):
        """Check the frames emitted by a plain 'dots' spinner.

        The first three captured lines must be the first three frames, each
        followed by the text; once stopped, the output widget must equal
        ``spinner._output('')``.
        """
        spinner = HaloNotebook(text='foo', spinner='dots')

        spinner.start()
        time.sleep(1)
        rendered = self._get_test_output(spinner)['text']
        spinner.stop()

        for index in range(3):
            self.assertEqual(rendered[index], '{} foo'.format(frames[index]))
        self.assertEqual(spinner.output.outputs, spinner._output(''))
Пример #18
0
    def test_invalid_placement(self):
        """Test invalid placement of spinner.

        Every invalid value must raise ``ValueError``, both when passed to the
        constructor and when assigned to the ``placement`` property.
        """
        invalid_placements = ('', 'foo', None)

        # Each value gets its own assertRaises context: inside a single
        # ``with`` block execution stops at the first statement that raises,
        # so the remaining values would never be exercised.
        for placement in invalid_placements:
            with self.assertRaises(ValueError):
                HaloNotebook(placement=placement)

        spinner = HaloNotebook(placement='left')
        for placement in invalid_placements:
            with self.assertRaises(ValueError):
                spinner.placement = placement
Пример #19
0
    def test_initial_title_spinner(self):
        """Check a spinner whose text is given positionally at construction.

        The first three captured lines must be the first three frames, each
        followed by the initial text; once stopped, the output widget must
        equal ``spinner._output('')``.
        """
        spinner = HaloNotebook('bar')

        spinner.start()
        time.sleep(1)
        rendered = self._get_test_output(spinner)['text']
        spinner.stop()

        for index in range(3):
            self.assertEqual(rendered[index], '{} bar'.format(frames[index]))
        self.assertEqual(spinner.output.outputs, spinner._output(''))
Пример #20
0
    def test_right_placement(self):
        """Test right placement of spinner.

        With ``placement="right"`` the text must come first in the rendered
        line, both while spinning and after ``succeed()``, where the second
        token must match the success symbol.
        """
        spinner = HaloNotebook(text="foo", placement="right")
        spinner.start()
        time.sleep(1)

        output = self._get_test_output(spinner)['text']
        (text, _) = output[-1].split(" ")
        self.assertEqual(text, "foo")

        spinner.succeed()
        output = self._get_test_output(spinner)['text']
        (text, symbol) = output[-1].split(" ")
        pattern = re.compile(r"(✔|v)", re.UNICODE)

        self.assertEqual(text, "foo")
        # assertRegex replaces the deprecated assertRegexpMatches alias,
        # which was removed from unittest in Python 3.12.
        self.assertRegex(symbol, pattern)
        spinner.stop()
Пример #21
0
    def test_spinner_color(self):
        """Test ANSI escape characters are present.

        For every entry of ``COLORS`` the colour code must appear among the
        colour values captured from the spinner output.
        """

        for color, color_int in COLORS.items():
            spinner = HaloNotebook(color=color)
            spinner.start()
            output = self._get_test_output(spinner, no_ansi=False)
            spinner.stop()

            # Flatten the per-line colour lists into a single list.
            output_merged = [arr for c in output['colors'] for arr in c]

            # assertIn replaces ``assertEquals(x in y, True)``: assertEquals
            # is a deprecated alias removed in Python 3.12, and assertIn
            # reports the missing value on failure.
            self.assertIn(str(color_int), output_merged)
Пример #22
0
    def test_text_stripping(self):
        """Test the text being stripped before output.

        A trailing newline in the text must not show up in the rendered
        frames, nor in the line persisted by ``succeed``.
        """
        spinner = HaloNotebook(text='foo\n', spinner='dots')

        spinner.start()
        time.sleep(1)
        output = self._get_test_output(spinner)['text']

        for index in range(3):
            self.assertEqual(output[index], '{} foo'.format(frames[index]))

        spinner.succeed('foo\n')
        output = self._get_test_output(spinner)['text']

        pattern = re.compile(r'(✔|v) foo', re.UNICODE)

        # assertRegex replaces the deprecated assertRegexpMatches alias,
        # which was removed from unittest in Python 3.12.
        self.assertRegex(output[-1], pattern)
Пример #23
0
    def test_text_spinner_color(self):
        """Check spinner and text colours for every available colour.

        For each entry of ``COLORS`` the first three captured frames must
        carry the expected colour code in position 0 (spinner) and in
        position 1 (text).
        """
        for color, color_int in COLORS.items():
            spinner = HaloNotebook(text='foo', text_color=color, color=color, spinner='dots')

            spinner.start()
            time.sleep(1)
            frame_colors = self._get_test_output(spinner)['colors']
            spinner.stop()

            # Slot 0 holds the spinner colour, slot 1 the text colour; all
            # spinner checks run before the text checks.
            for slot in (0, 1):
                for frame_index in range(3):
                    self.assertEqual(color_int, int(frame_colors[frame_index][slot]))
Пример #24
0
def model_explanation(data_df,
                      prediction_column,
                      problem_type,
                      snr='auto',
                      file_name=None):
    """
    .. _model-explanation:
    Analyzes the variables that a model relies on the most in a brute-force fashion.

    The first variable is the variable the model relies on the most. The second variable is the variable that complements the first variable the most in explaining model decisions etc.

    Running performances should be understood as the performance achievable when trying to guess model predictions using variables with selection order smaller or equal to that of the row.


    Parameters
    ----------
    data_df : pandas.DataFrame
        The pandas DataFrame containing the data.
    prediction_column : str
        The name of the column containing the model predictions to explain. It must be a column of :code:`data_df`.
    problem_type : 'classification' | 'regression'
        The type of supervised learning problem (case-insensitive). The value is validated up front, so :code:`None` is not accepted by this implementation.
    snr : optional
        Forwarded as-is to the backend with every request. Defaults to :code:`'auto'`.
    file_name : None | str
        A unique identifier characterizing data_df in the form of a file name. Do not set this unless you know why.


    Returns
    -------
    result : pandas.DataFrame | None
        The result is a pandas.Dataframe with columns (where applicable):

        * :code:`'Selection Order'`: The order in which the associated variable was selected, starting at 1 for the most important variable. Used as index when present.
        * :code:`'Variable'`: The column name corresponding to the input variable.
        * :code:`'Running Achievable R-Squared'`: The highest :math:`R^2` that can be achieved by a model using all variables selected so far, including this one.
        * :code:`'Running Achievable Log-Likelihood Per Sample'`: The highest log-likelihood per sample that can be achieved by a model using all variables selected so far, including this one.
        * :code:`'Running Achievable Accuracy'`: The highest classification accuracy that can be achieved by a classification model using all variables selected so far, including this one (classification only).
        * :code:`'Running Achievable RMSE'`: The lowest Root Mean Square Error that can be achieved by a regression model using all variables selected so far, including this one (regression only).

        :code:`None` is returned when an unexpected error occurs while polling the backend; the error is logged.


    Raises
    ------
    LongerThanExpectedException
        When no result could be retrieved: the data upload returned no file name, the backend answered with a non-OK status, or the polling budget was exhausted.


    .. admonition:: Theoretical Foundation

        Section :ref:`a) Model Explanation`.

    """
    assert prediction_column in data_df.columns, 'The label column should be a column of the dataframe.'
    assert problem_type.lower() in ['classification', 'regression']
    if problem_type.lower() == 'regression':
        assert np.can_cast(data_df[prediction_column],
                           float), 'The prediction column should be numeric'

    # kp counts loop ticks (sleep-only ticks and request ticks alternate);
    # k = kp // 2 is the value compared against the max_k budget.
    k = 0
    kp = 0
    max_k = 100

    file_name = upload_data(data_df, file_name=file_name)
    spinner = Halo(text='Waiting for results from the backend.',
                   spinner='dots')
    spinner.start()

    def _submit(**extra):
        # Single place where the request is built. ``extra`` (job_id, when
        # known) is inserted before ``snr`` to keep the original keyword order.
        # NOTE(review): the route is '/wk/variable-selection' although this is
        # model explanation -- presumably intentional, confirm with backend.
        params = {
            'path': '/wk/variable-selection',
            'method': 'POST',
            'file_name': file_name,
            'target_column': prediction_column,
            'problem_type': problem_type,
            'timestamp': int(time()),
        }
        params.update(extra)
        params['snr'] = snr
        return APIClient.route(**params)

    if file_name:
        job_key = (file_name, prediction_column, problem_type)
        job_id = EXPLANATION_JOB_IDS.get(job_key, None)
        api_response = _submit(job_id=job_id) if job_id else _submit()

        initial_time = time()
        while api_response.status_code == requests.codes.ok and k < max_k:
            try:
                if kp % 2 != 0:
                    # Odd ticks only wait before the next request tick.
                    sleep(2 if kp < 5 else 10 if k < max_k - 4 else 300)
                    kp += 1
                    k = kp // 2
                    continue

                response = api_response.json()
                if 'job_id' in response:
                    job_id = response['job_id']
                    EXPLANATION_JOB_IDS[job_key] = job_id
                    sleep(2 if kp < 5 else 10 if k < max_k - 4 else 300)
                    kp += 1
                    k = kp // 2

                    # Note: it is important to pass the job_id to avoid being charged twice for the work.
                    api_response = _submit(job_id=job_id)

                    try:
                        response = api_response.json()
                        if 'eta' in response:
                            progress_text = '%s%% Completed.' % response[
                                'progress_pct'] if 'progress_pct' in response else ''
                            spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (
                                response['eta'], progress_text)
                    except Exception:
                        # Best effort: if the refreshed payload cannot be
                        # parsed, keep the previous ``response`` and let the
                        # loop condition inspect the new status code.
                        pass

                if ('job_id' not in response) or ('selection_order'
                                                  in response):
                    duration = int(time() - initial_time)
                    duration = str(
                        duration) + 's' if duration < 60 else str(
                            duration // 60) + 'min'

                    kind = problem_type.lower()
                    # (response key, result column, include?) in column order.
                    fields = (
                        ('selection_order', 'Selection Order', True),
                        ('variable', 'Variable', True),
                        ('r-squared', 'Running Achievable R-Squared', True),
                        ('log-likelihood',
                         'Running Achievable Log-Likelihood Per Sample', True),
                        ('rmse', 'Running Achievable RMSE',
                         kind == 'regression'),
                        ('accuracy', 'Running Achievable Accuracy',
                         kind == 'classification'),
                    )
                    result = pd.DataFrame.from_dict({
                        label: response[key]
                        for key, label, wanted in fields
                        if wanted and key in response
                    })

                    if 'selection_order' in response:
                        result.set_index('Selection Order', inplace=True)

                    spinner.text = 'Received results from the backend after %s.' % duration
                    spinner.succeed()
                    return result

            except Exception:
                logging.exception(
                    '\nModel explanation failed. Last HTTP code: %s, Content: %s'
                    % (api_response.status_code, api_response.content))
                spinner.text = 'The backend encountered an unexpected error we are looking into. Please try again later.'
                spinner.fail()
                return None
            except KeyboardInterrupt:
                # Let Ctrl-C through (it used to be swallowed by a bare
                # ``except``), but do not leave the spinner animating.
                spinner.stop()
                raise

        # Reaching this point means either a non-OK status or an exhausted
        # polling budget; in both cases the spinner must be terminated.
        spinner.text = 'The backend is taking longer than expected. Please try again later'
        spinner.fail()
        if api_response.status_code != requests.codes.ok:
            try:
                response = api_response.json()
                if 'message' in response:
                    logging.error('\n%s' % response['message'])
            except Exception:
                logging.error(
                    '\nModel explanation failed. Last HTTP code: %s, Content: %s'
                    % (api_response.status_code, api_response.content))
    else:
        # No file name came back from upload_data: nothing was requested, so
        # just stop the spinner before raising.
        spinner.stop()

    raise LongerThanExpectedException(
        'The backend is taking longer than expected, but rest reassured your task is still running. Please try again later to retrieve your results.'
    )
Пример #25
0
    def test_default_placement(self):
        """Check that a spinner built without arguments is placed on the left."""
        default_placement = HaloNotebook().placement
        self.assertEqual(default_placement, 'left')
Пример #26
0
def process_dataset(input_file, output_file, scrape):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).

        The raw data is scraped from the URLs listed in
        ``./references/urls.txt`` when ``scrape`` is true or ``input_file``
        does not exist; otherwise ``input_file`` is loaded. Duplicates and
        invalid values are then removed (the interim result is saved to
        ``output_file`` with "processed" replaced by "interim"), followed by
        outliers (saved to ``output_file``).

        Parameters
        ----------
        input_file: str
            Raw CSV file; written after scraping, read otherwise
        output_file: str
            Output processed file
        scrape: bool
            Force the scraping process

        Returns
        -------
        The dataframe returned by ``remove_outliers``.
    """
    spinner = Halo(text='Making dataset...', spinner='dots')
    logger = logging.getLogger(__name__)
    logger.info('Making final dataset from raw data')
    # Scrape data
    if scrape or not os.path.exists(input_file):
        spinner.start("Scraping data")
        with open('./references/urls.txt', 'r') as f:
            # Strip the trailing newline kept by file iteration and skip
            # blank lines so that only clean URLs are passed on.
            urls = [line.strip() for line in f if line.strip()]
        scraped_dfs = [navigate(url, 1, 500) for url in urls]
        # Save results
        raw_data = pd.concat(scraped_dfs)
        raw_data.to_csv(input_file, index=False)
        spinner.succeed("Data Scrapped!")
    else:
        # start (not succeed): the load has not happened yet, so no success
        # line should be printed before it.
        spinner.start("Loading scraped file...")
        raw_data = pd.read_csv(input_file)
        spinner.succeed("Scraped file already exists!")

    # Remove duplicates
    spinner.start("Removing duplicates and invalid values...")
    time.sleep(1)
    interim_data = remove_duplicates_and_na(raw_data)
    interim_data.to_csv(output_file.replace("processed", "interim"),
                        index=False)
    spinner.succeed("Done removing duplicates!")

    # Remove outliers
    spinner.start("Removing outliers and inconsistent values...")
    time.sleep(1)
    final_data = remove_outliers(interim_data)
    final_data.to_csv(output_file, index=False)
    spinner.succeed("Done removing outliers!")
    spinner.start("Cleaning processing done!")
    spinner.stop_and_persist(symbol='✔'.encode('utf-8'),
                             text="Cleaning processing done!")

    return final_data
Пример #27
0
def data_valuation(data_df,
                   target_column,
                   problem_type,
                   snr='auto',
                   include_mutual_information=False,
                   file_name=None):
    """
    .. _data-valuation:
    Estimate the highest performance metrics achievable when predicting the :code:`target_column` using all other columns.


    Parameters
    ----------
    data_df : pandas.DataFrame
        The pandas DataFrame containing the data.
    target_column : str
        The name of the column containing true labels. It must be a column of :code:`data_df`.
    problem_type : 'classification' | 'regression'
        The type of supervised learning problem (case-insensitive). The value is validated up front, so :code:`None` is not accepted by this implementation.
    snr : optional
        Forwarded as-is to the backend with every request. Defaults to :code:`'auto'`.
    include_mutual_information : bool
        Whether to include the mutual information between target and explanatory variables in the result.
    file_name : None | str
        A unique identifier characterizing data_df in the form of a file name. Do not set this unless you know why.



    Returns
    -------
    achievable_performance : pandas.Dataframe | None
        The result is a pandas.Dataframe with columns (where applicable):

        * :code:`'Achievable Accuracy'`: The highest classification accuracy that can be achieved by a model using provided inputs to predict the label (classification only).
        * :code:`'Achievable R-Squared'`: The highest :math:`R^2` that can be achieved by a model using provided inputs to predict the label.
        * :code:`'Achievable RMSE'`: The lowest Root Mean Square Error that can be achieved by a model using provided inputs to predict the label (regression only).
        * :code:`'Achievable Log-Likelihood Per Sample'`: The highest true log-likelihood per sample that can be achieved by a model using provided inputs to predict the label.
        * :code:`'Mutual Information'`: Only when :code:`include_mutual_information` is true and the backend provides it.

        :code:`None` is returned when an unexpected error occurs while polling the backend; the error is logged.


    Raises
    ------
    LongerThanExpectedException
        When no result could be retrieved: the data upload returned no file name, the backend answered with a non-OK status, or the polling budget was exhausted.


    .. admonition:: Theoretical Foundation

        Section :ref:`1 - Achievable Performance`.
    """
    assert target_column in data_df.columns, 'The label column should be a column of the dataframe.'
    assert problem_type.lower() in ['classification', 'regression']
    if problem_type.lower() == 'regression':
        assert np.can_cast(data_df[target_column],
                           float), 'The target column should be numeric'

    # k counts the polling requests issued; max_k is the polling budget.
    k = 0
    max_k = 100

    file_name = upload_data(data_df, file_name=file_name)
    spinner = Halo(text='Waiting for results from the backend.',
                   spinner='dots')
    spinner.start()

    def _submit(**extra):
        # Single place where the request is built. ``extra`` (job_id, when
        # known) is inserted before ``snr`` to keep the original keyword order.
        params = {
            'path': '/wk/data-valuation',
            'method': 'POST',
            'file_name': file_name,
            'target_column': target_column,
            'problem_type': problem_type,
            'timestamp': int(time()),
        }
        params.update(extra)
        params['snr'] = snr
        return APIClient.route(**params)

    def _show_eta(payload):
        # Reflect the backend's ETA (and progress, when given) in the spinner.
        if 'eta' in payload:
            progress_text = '%s%% Completed.' % payload[
                'progress_pct'] if 'progress_pct' in payload else ''
            spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (
                payload['eta'], progress_text)

    if file_name:
        job_key = (file_name, target_column, problem_type, snr)
        job_id = VALUATION_JOB_IDS.get(job_key, None)
        api_response = _submit(job_id=job_id) if job_id else _submit()

        initial_time = time()
        while api_response.status_code == requests.codes.ok and k < max_k:
            try:
                response = api_response.json()
                _show_eta(response)

                if ('job_id' in response) and ('r-squared' not in response):
                    job_id = response['job_id']
                    VALUATION_JOB_IDS[job_key] = job_id
                    k += 1
                    sleep(15.)

                    # Note: it is important to pass the job_id to avoid being charged twice for the same work.
                    api_response = _submit(job_id=job_id)

                    try:
                        response = api_response.json()
                        _show_eta(response)
                    except Exception:
                        # Best effort: if the refreshed payload cannot be
                        # parsed, keep the previous ``response`` and let the
                        # loop condition inspect the new status code.
                        pass

                if ('job_id' not in response) or ('r-squared' in response):
                    duration = int(time() - initial_time)
                    duration = str(duration) + 's' if duration < 60 else str(
                        duration // 60) + 'min'

                    kind = problem_type.lower()
                    # (response key, result column, include?) in column order.
                    fields = (
                        ('r-squared', 'Achievable R-Squared', True),
                        ('log-likelihood',
                         'Achievable Log-Likelihood Per Sample', True),
                        ('rmse', 'Achievable RMSE', kind == 'regression'),
                        ('accuracy', 'Achievable Accuracy',
                         kind == 'classification'),
                        ('mi', 'Mutual Information',
                         include_mutual_information),
                    )
                    result = pd.DataFrame.from_dict({
                        label: [response[key]]
                        for key, label, wanted in fields
                        if wanted and key in response
                    })

                    spinner.text = 'Received results from the backend after %s.' % duration
                    spinner.succeed()

                    return result

            except Exception:
                logging.exception(
                    '\nData valuation failed. Last HTTP code: %s' %
                    api_response.status_code)
                spinner.text = 'The backend encountered an unexpected error we are looking into. Please try again later.'
                spinner.fail()
                return None
            except KeyboardInterrupt:
                # Let Ctrl-C through (it used to be swallowed by a bare
                # ``except``), but do not leave the spinner animating.
                spinner.stop()
                raise

        # Reaching this point means either a non-OK status or an exhausted
        # polling budget; in both cases the spinner must be terminated.
        spinner.text = 'The backend is taking longer than expected. Try again later.'
        spinner.fail()
        if api_response.status_code != requests.codes.ok:
            try:
                response = api_response.json()
                if 'message' in response:
                    logging.error('\n%s' % response['message'])
            except Exception:
                logging.error('\nData valuation failed. Last HTTP code: %s' %
                              api_response.status_code)
    else:
        # No file name came back from upload_data: nothing was requested, so
        # just stop the spinner before raising.
        spinner.stop()

    raise LongerThanExpectedException(
        'The backend is taking longer than expected, but rest reassured your task is still running. Please try again later to retrieve your results.'
    )
Пример #28
0
    def test_spinner_getters_setters(self):
        """Exercise the ``text``, ``color`` and ``spinner`` properties.

        Covers the initial values, assignment of known spinner names (which
        resolve to the default spinner when ``is_supported()`` is false), an
        unknown spinner name, and a ``None`` colour surviving start/stop.
        """
        halo = HaloNotebook()
        self.assertEqual(halo.text, '')
        self.assertEqual(halo.color, 'cyan')
        self.assertIsNone(halo.spinner_id)

        halo.spinner = 'dots12'
        halo.text = 'bar'
        halo.color = 'red'

        self.assertEqual(halo.text, 'bar')
        self.assertEqual(halo.color, 'red')

        expected = Spinners['dots12'].value if is_supported() else default_spinner
        self.assertEqual(halo.spinner, expected)

        halo.spinner = 'dots11'
        expected = Spinners['dots11'].value if is_supported() else default_spinner
        self.assertEqual(halo.spinner, expected)

        # An unknown spinner name resolves to the default spinner.
        halo.spinner = 'foo_bar'
        self.assertEqual(halo.spinner, default_spinner)

        # Color is None
        halo.color = None
        halo.start()
        halo.stop()
        self.assertIsNone(halo.color)
Пример #29
0
 def test_id_not_created_before_start(self):
     """Check that a freshly built spinner has no spinner ID yet."""
     initial_id = HaloNotebook().spinner_id
     self.assertEqual(initial_id, None)
Пример #30
0
def _dd_improvability_frame(response, problem_type):
	"""Build the single-row result DataFrame from the backend's final JSON payload."""
	result = {}
	if 'r-squared-boost' in response:
		result['R-Squared Boost'] = [response['r-squared-boost']]

	if 'log-likelihood-boost' in response:
		result['Log-Likelihood Per Sample Boost'] = [response['log-likelihood-boost']]

	# RMSE is only reported for regression, accuracy only for classification.
	if 'rmse-reduction' in response and problem_type.lower() == 'regression':
		result['RMSE Reduction'] = [response['rmse-reduction']]

	if 'accuracy-boost' in response and problem_type.lower() == 'classification':
		result['Accuracy Boost'] = [response['accuracy-boost']]

	return pd.DataFrame.from_dict(result)


def data_driven_improvability(data_df, target_column, new_variables, problem_type, snr='auto', file_name=None):
	"""
	.. data-driven-improvability:
	Estimate the potential performance boost that a set of new explanatory variables can bring about.


	Parameters
	----------
	data_df : pandas.DataFrame
		The pandas DataFrame containing the data.
	target_column : str
		The name of the column containing true labels.
	new_variables : list
		The names of the columns to use as new explanatory variables. Must be non-empty.
	problem_type : 'classification' | 'regression'
		The type of supervised learning problem (matched case-insensitively).
	snr : str
		Forwarded unchanged to the backend as the :code:`snr` request parameter. Defaults to :code:`'auto'`.
	file_name : None | str
		A unique identifier characterizing data_df in the form of a file name. Do not set this unless you know why.



	Returns
	-------
	result : pandas.Dataframe | None
		None when the data upload fails, when the backend reports an error, or when polling gives up.
		Otherwise a pandas.Dataframe with columns (where applicable):

		* :code:`'Accuracy Boost'`: The classification accuracy boost that the new explanatory variables can bring about.
		* :code:`'R-Squared Boost'`: The :math:`R^2` boost that the new explanatory variables can bring about.
		* :code:`'RMSE Reduction'`: The reduction in Root Mean Square Error that the new explanatory variables can bring about.
		* :code:`'Log-Likelihood Per Sample Boost'`: The boost in log-likelihood per sample that the new explanatory variables can bring about.


	Raises
	------
	AssertionError
		If the target column or any new variable is not a column of the dataframe, if the problem type is not
		recognised, if no new variable is provided, or if the target of a regression problem is not numeric.


	.. admonition:: Theoretical Foundation

		Section :ref:`3 - Model Improvability`.
		
	"""
	assert target_column in data_df.columns, 'The label column should be a column of the dataframe.'
	assert problem_type.lower() in ['classification', 'regression']
	assert len(new_variables) > 0, 'New variables should be provided'
	for col in new_variables:
		assert col in data_df.columns, '%s should be a column in the dataframe' % col
	if problem_type.lower() == 'regression':
		assert np.can_cast(data_df[target_column], float), 'The target column should be numeric'

	max_k = 100

	file_name = upload_data(data_df, file_name=file_name)
	if not file_name:
		# Nothing to poll for. Return before starting the spinner so that it
		# is not left running forever.
		return None

	# Key under which the backend job id for this exact request is remembered.
	cache_key = (file_name, target_column, str(new_variables), problem_type, snr)

	def _post(job_id=None):
		# Single place that builds the request; the timestamp is taken at call time.
		params = {
			'file_name': file_name,
			'target_column': target_column,
			'problem_type': problem_type,
			'new_variables': json.dumps(new_variables),
		}
		if job_id:
			params['job_id'] = job_id
		params['timestamp'] = int(time())
		params['snr'] = snr
		return APIClient.route(path='/wk/data-driven-improvability', method='POST', **params)

	def _delay(kp, k):
		# Back-off schedule: 2s for the first few steps, then 10s, and 300s
		# for the last few attempts before giving up.
		return 2 if kp < 5 else 10 if k < max_k - 4 else 300

	spinner = Halo(text='Waiting for results from the backend.', spinner='dots')
	spinner.start()

	try:
		api_response = _post(DD_IMPROVABILITY_JOB_IDS.get(cache_key, None))

		k = 0
		kp = 0
		initial_time = time()
		while api_response.status_code == requests.codes.ok and k < max_k:
			if kp % 2 != 0:
				sleep(_delay(kp, k))
				kp += 1
				k = kp // 2
				continue

			try:
				response = api_response.json()
				if 'job_id' in response:
					# The job is known to the backend: remember its id, wait, then poll again.
					DD_IMPROVABILITY_JOB_IDS[cache_key] = response['job_id']
					sleep(_delay(kp, k))
					kp += 1
					k = kp // 2
					api_response = _post()

					try:
						response = api_response.json()
						if 'eta' in response:
							progress_text = '%s%% Completed.' % response['progress_pct'] if 'progress_pct' in response else ''
							spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (response['eta'], progress_text)
					except Exception:
						# Progress reporting is best effort; keep the previous
						# payload and let the loop re-evaluate the new response.
						pass

				if ('job_id' not in response) or ('r-squared-boost' in response):
					duration = int(time() - initial_time)
					duration = str(duration) + 's' if duration < 60 else str(duration // 60) + 'min'
					result = _dd_improvability_frame(response, problem_type)
					spinner.text = 'Received results from the backend after %s' % duration
					spinner.succeed()
					return result

			except Exception:
				# Deliberately not a bare except: an interrupt raised while
				# sleeping must not be reported as a backend error.
				spinner.text = 'The backend encountered an unexpected error we are looking into. Please try again later.'
				spinner.fail()
				return None

	except (KeyboardInterrupt, SystemExit):
		# Let the interrupt through, but do not leave the spinner running.
		spinner.stop()
		raise

	# Reached on a non-OK status or once the polling budget is exhausted;
	# either way the spinner must be terminated.
	spinner.text = 'The backend is taking longer than expected. Try again later.'
	spinner.fail()
	if api_response.status_code != requests.codes.ok:
		try:
			response = api_response.json()
			if 'message' in response:
				logging.error('\n%s', response['message'])
		except Exception:
			logging.error('\nData-driven improvability failed. Last HTTP code: %s', api_response.status_code)

	return None